From 45a3008ceb0b9f55b23fdc3dc8d4f4be480b86aa Mon Sep 17 00:00:00 2001 From: vllmellm Date: Mon, 10 Nov 2025 08:02:17 +0000 Subject: [PATCH 001/578] feat: Integrate AITER bpreshuffle and ck operators on top of fp8 refactor Signed-off-by: vllmellm --- .../schemes/compressed_tensors_w8a8_fp8.py | 2 + .../kernels/scaled_mm/__init__.py | 4 + .../quantization/kernels/scaled_mm/aiter.py | 217 +++++++++++++++++- 3 files changed, 222 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index 2cd29e0905d0..e25d2aaa439b 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -192,6 +192,8 @@ def process_weights_after_loading(self, layer) -> None: if self.strategy == QuantizationStrategy.BLOCK: maybe_post_process_fp8_weight_block(layer, self.cutlass_block_fp8_supported) + self.fp8_linear.process_weights_after_loading(layer) + def apply_weights( self, layer: torch.nn.Module, diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py index b033cc7905e4..b8c7f78aac64 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py @@ -8,6 +8,8 @@ from vllm.logger import init_logger from vllm.model_executor.layers.quantization.kernels.scaled_mm.aiter import ( + AiterBpreshufflePerTokenFp8ScaledMMLinearKernel, + AiterCKPerTokenFp8ScaledMMLinearKernel, AiterScaledMMLinearKernel, ) from vllm.model_executor.layers.quantization.kernels.scaled_mm.cpu import ( @@ -64,6 +66,8 @@ ChannelWiseTorchScaledMMLinearKernel, ], PlatformEnum.ROCM: [ + AiterBpreshufflePerTokenFp8ScaledMMLinearKernel, + AiterCKPerTokenFp8ScaledMMLinearKernel, ROCmScaledMMLinearKernel, PerTensorTorchScaledMMLinearKernel, RowWiseTorchScaledMMLinearKernel, diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py index 3ac90553bbc7..430e407156c5 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py @@ -2,15 +2,25 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Callable + import torch +from aiter.ops.shuffle import shuffle_weight import vllm.envs as envs from vllm import _custom_ops as ops +from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.utils.torch_utils import direct_register_custom_op from .cutlass import CutlassScaledMMLinearKernel -from .ScaledMMLinearKernel import Int8ScaledMMLinearLayerConfig +from .ScaledMMLinearKernel import ( + FP8ScaledMMLinearKernel, + FP8ScaledMMLinearLayerConfig, + Int8ScaledMMLinearLayerConfig, +) + +logger = init_logger(__name__) def rocm_aiter_gemm_w8a8_impl( @@ -52,6 +62,54 @@ def rocm_aiter_gemm_w8a8_fake( ) +# bpshuffle +def rocm_aiter_gemm_a8w8_bpreshuffle_impl( + input: torch.Tensor, + weight: torch.Tensor, + out_dtype: torch.dtype | None = None, + scale_a: torch.Tensor | None = None, + scale_b: torch.Tensor | None = None, +) -> torch.Tensor: + # This AITER function can be used for + # - per-token 
activations + per-channel weights + # e.g. vllm/model_executor/layers/quantization/utils/w8a8_utils.py + # accept the weight as # keep the weight as (N, K) + # NOTE: The weight has to be shuffled in the + # process_weights_after_loading of the CompressedTensorsW8A8Fp8 class + + from aiter import gemm_a8w8_bpreshuffle_ck + + m = input.shape[0] + n = weight.shape[0] + Y = torch.empty(m, n, dtype=out_dtype, device=input.device) + gemm_a8w8_bpreshuffle_ck(input, weight, scale_a, scale_b, Y) + return Y + + +def rocm_aiter_gemm_a8w8_bpreshuffle_fake( + input: torch.Tensor, + weight: torch.Tensor, + out_dtype: torch.dtype | None = None, + scale_a: torch.Tensor | None = None, + scale_b: torch.Tensor | None = None, +) -> torch.Tensor: + m = input.shape[0] + n = weight.shape[0] + if out_dtype is None: + out_dtype = input.dtype + return torch.empty((m, n), dtype=out_dtype, device=input.device) + + +if current_platform.is_rocm(): + direct_register_custom_op( + op_name="rocm_aiter_gemm_a8w8_bpreshuffle", + op_func=rocm_aiter_gemm_a8w8_bpreshuffle_impl, + mutates_args=[], + fake_impl=rocm_aiter_gemm_a8w8_bpreshuffle_fake, + dispatch_key=current_platform.dispatch_key, + ) + + class AiterScaledMMLinearKernel(CutlassScaledMMLinearKernel): @classmethod def get_min_capability(cls) -> int: @@ -157,3 +215,160 @@ def apply_weights( return torch.ops.vllm.rocm_aiter_gemm_w8a8( x_q, w_q.t(), x_s, w_s, bias, out_dtype ) + + +# bpreshuffle +class AiterBpreshufflePerTokenFp8ScaledMMLinearKernel(FP8ScaledMMLinearKernel): + def get_ouput_padding(self) -> int | None: + # PTPC kernels do not require padding. + return None + + @classmethod + def can_implement(cls, c: FP8ScaledMMLinearLayerConfig) -> tuple[bool, str | None]: + if not current_platform.is_rocm(): + return (False, "AITER bpreshuffle is ROCm-only") + if not (envs.VLLM_ROCM_USE_AITER_LINEAR and envs.VLLM_ROCM_USE_AITER): + return (False, "AITER bpreshuffle is disabled by env var") + try: + import aiter # noqa: F401 + except Exception: + return (False, "AITER not installed") + + # Check if the configuration is PTPC + is_per_channel_weight = c.weight_quant_key.scale.group_shape.is_per_token() + is_per_token_activation = ( + c.activation_quant_key.scale.group_shape.is_per_token() + ) + is_ptpc = is_per_channel_weight and is_per_token_activation + + logger.info_once(f"AiterBpreshuffle: can_implement called. is_ptpc={is_ptpc}") + + if not is_ptpc: + return (False, "This kernel only handles Per-Token/Per-Channel (PTPC)") + + return True, None + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + logger.info_once("AiterBpreshuffle: SHUFFLING WEIGHTS NOW.") + + w_q, _, _, _ = self._get_layer_params(layer) + + N = w_q.shape[1] + K = w_q.shape[0] + + if N % 16 == 0 and K % 16 == 0: + # AITER shuffle_weight expectation [N, K] + w_q_nk = w_q.t().contiguous() + + # Execute shuffle + shuffled_w_nk = shuffle_weight(w_q_nk, layout=(16, 16)) + + del layer.weight + layer.register_buffer("weight", shuffled_w_nk) + + logger.info_once("[AiterBpreshuffle: Weight shuffle COMPLETE.") + + else: + raise ValueError( + f"Weight shape (N={N}, K={K}) not divisible by 16 " + "for AITER bpreshuffle." + ) + + def apply_weights( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + # 1. Obtain parameters + w_q, w_s, x_s, x_s_ub = self._get_layer_params(layer) + # 2. Dynamic quantization input + qinput, qinput_scale = self.quant_fp8(x, x_s, x_s_ub) + + logger.info_once( + "AiterBpreshuffle: apply_weights... 
ABOUT TO CALL C++ KERNEL..." + ) + + # 3. Call the AITER bpreshuffle CK operator. + output = torch.ops.vllm.rocm_aiter_gemm_a8w8_bpreshuffle( + qinput, + w_q, # Input [N, K] shuffle weights + out_dtype=self.config.out_dtype, + scale_a=qinput_scale, + scale_b=w_s, + ) + + logger.info_once("AiterBpreshuffle: C++ KERNEL CALL SUCCEEDED.") + + if bias is not None: + output.add_(bias) + return output + + def get_scaled_mm_func(self) -> Callable[..., torch.Tensor]: + return rocm_aiter_gemm_a8w8_bpreshuffle_impl + + +# AITER FP8 CK +class AiterCKPerTokenFp8ScaledMMLinearKernel(FP8ScaledMMLinearKernel): + """ + AITER PTPC kernel (gemm_a8w8_CK) without pre-shuffling. + """ + + def get_ouput_padding(self) -> int | None: + return None + + @classmethod + def can_implement(cls, c: FP8ScaledMMLinearLayerConfig) -> tuple[bool, str | None]: + if not current_platform.is_rocm(): + return (False, "AITER CK is ROCm-only") + if not (envs.VLLM_ROCM_USE_AITER_LINEAR and envs.VLLM_ROCM_USE_AITER): + return (False, "AITER CK is disabled by env var") + try: + import aiter # noqa: F401 + except Exception: + return (False, "AITER not installed") + + is_per_channel_weight = c.weight_quant_key.scale.group_shape.is_per_token() + is_per_token_activation = ( + c.activation_quant_key.scale.group_shape.is_per_token() + ) + is_ptpc = is_per_channel_weight and is_per_token_activation + + logger.info_once(f"AiterCK: can_implement called. is_ptpc={is_ptpc}") + + if not is_ptpc: + return (False, "This kernel only handles Per-Token/Per-Channel (PTPC)") + + return True, None + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + logger.info_once( + "AITER CK: process_weights_after_loading... DOING NOTHING (pass)." + ) + pass + + def apply_weights( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + w_q, w_s, x_s, x_s_ub = self._get_layer_params(layer) + + qinput, qinput_scale = self.quant_fp8(x, x_s, x_s_ub) + + logger.info_once( + "AiterCK: apply_weights... " + "ABOUT TO CALL C++ KERNEL (this is where it hangs)..." 
+ ) + + output = torch.ops.vllm.rocm_aiter_gemm_w8a8( + qinput, w_q.t(), qinput_scale, w_s, bias, self.config.out_dtype + ) + + logger.info_once("AiterCK: C++ KERNEL CALL SUCCEEDED.") + + return output + + def get_scaled_mm_func(self) -> Callable[..., torch.Tensor]: + return rocm_aiter_gemm_w8a8_impl From fa183e92713456dec682088a362dd9908100cc03 Mon Sep 17 00:00:00 2001 From: Jiangyun Zhu Date: Thu, 13 Nov 2025 15:59:58 +0800 Subject: [PATCH 002/578] [Bugfix] fix kimi-linear crash (#28445) Signed-off-by: zjy0516 --- vllm/model_executor/layers/kda.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/layers/kda.py b/vllm/model_executor/layers/kda.py index 26458f2e3c4d..2e7500bac718 100644 --- a/vllm/model_executor/layers/kda.py +++ b/vllm/model_executor/layers/kda.py @@ -44,7 +44,6 @@ def kda_attention( k_proj_states: torch.Tensor, v_proj_states: torch.Tensor, g1: torch.Tensor, - g2: torch.Tensor, beta: torch.Tensor, core_attn_out: torch.Tensor, layer_name: str, @@ -56,7 +55,6 @@ def kda_attention( k_proj_states=k_proj_states, v_proj_states=v_proj_states, g1=g1, - g2=g2, beta=beta, core_attn_out=core_attn_out, ) @@ -67,7 +65,6 @@ def kda_attention_fake( k_proj_states: torch.Tensor, v_proj_states: torch.Tensor, g1: torch.Tensor, - g2: torch.Tensor, beta: torch.Tensor, core_attn_out: torch.Tensor, layer_name: str, @@ -284,7 +281,6 @@ def forward( k, v, g1, - g2, beta, core_attn_out, self.prefix, @@ -299,7 +295,6 @@ def _forward( k_proj_states: torch.Tensor, v_proj_states: torch.Tensor, g1: torch.Tensor, - g2: torch.Tensor, beta: torch.Tensor, core_attn_out: torch.Tensor, ) -> None: @@ -316,8 +311,15 @@ def _forward( has_initial_state = attn_metadata.has_initial_state non_spec_query_start_loc = attn_metadata.non_spec_query_start_loc non_spec_state_indices_tensor = attn_metadata.non_spec_state_indices_tensor # noqa: E501 + num_actual_tokens = attn_metadata.num_actual_tokens constant_caches = self.kv_cache[forward_context.virtual_engine] + q_proj_states = q_proj_states[:num_actual_tokens] + k_proj_states = k_proj_states[:num_actual_tokens] + v_proj_states = v_proj_states[:num_actual_tokens] + g1 = g1[:num_actual_tokens] + beta = beta[:num_actual_tokens] + (conv_state_q, conv_state_k, conv_state_v, recurrent_state) = constant_caches # deal with strides conv_state_q = conv_state_q.transpose(-1, -2) @@ -372,7 +374,7 @@ def _forward( ).transpose(0, 1) else: decode_conv_indices = non_spec_state_indices_tensor[ - : attn_metadata.num_decodes + : attn_metadata.num_actual_tokens ] q = causal_conv1d_update( q_proj_states, @@ -438,8 +440,9 @@ def _forward( beta=beta, initial_state=recurrent_state, use_qk_l2norm_in_kernel=True, - cu_seqlens=non_spec_query_start_loc, + cu_seqlens=non_spec_query_start_loc[: attn_metadata.num_decodes + 1], ssm_state_indices=non_spec_state_indices_tensor, ) - assert core_attn_out_non_spec.shape == core_attn_out.shape - core_attn_out[:] = core_attn_out_non_spec + core_attn_out[0, :num_actual_tokens] = core_attn_out_non_spec[ + 0, :num_actual_tokens + ] From 5c9ad138d507320f6432cfc3d727980853fd5e91 Mon Sep 17 00:00:00 2001 From: Chauncey Date: Thu, 13 Nov 2025 16:14:13 +0800 Subject: [PATCH 003/578] [Frontend] supports interleaved thinking (#28531) Signed-off-by: chaunceyjiang --- docs/features/interleaved_thinking.md | 118 ++++++++++++++++++ ...penai_chat_completion_client_with_tools.py | 1 + vllm/entrypoints/chat_utils.py | 17 ++- 3 files changed, 135 insertions(+), 1 deletion(-) create mode 100644 
docs/features/interleaved_thinking.md diff --git a/docs/features/interleaved_thinking.md b/docs/features/interleaved_thinking.md new file mode 100644 index 000000000000..7343324b4849 --- /dev/null +++ b/docs/features/interleaved_thinking.md @@ -0,0 +1,118 @@ +# Interleaved Thinking + +## Introduction + +Interleaved thinking allows models to reason between tool calls, enabling more sophisticated decision-making after receiving tool results. This feature helps models chain multiple tool calls with reasoning steps in between and make nuanced decisions based on intermediate results. + +Important: Interleaved thinking increases token usage and response latency. Consider your budget and performance requirements when enabling this feature. + +## How Interleaved Thinking Works + +With interleaved thinking, the model can: + +- Reason about the results of a tool call before deciding what to do next +- Chain multiple tool calls with reasoning steps in between +- Make more nuanced decisions based on intermediate results +- Provide transparent reasoning for its tool selection process + +## Supported Models + +vLLM currently supports the following interleaved thinking models: + +| Model Series | Reasoning Parser Name | +|--------------|-----------------------| +| moonshotai/Kimi-K2-Thinking | kimi_k2 | +| MiniMaxAI/MiniMax-M2 | minimax_m2 | + +## Example Usage + +To use interleaved thinking with tool calls, specify a model that supports this feature and enable tool calls in your chat completion request. Here's an example: + +??? code + + ```python + """ + vllm serve MiniMaxAI/MiniMax-M2 \ + --tensor-parallel-size 4 \ + --tool-call-parser minimax_m2 \ + --reasoning-parser minimax_m2 \ + --enable-auto-tool-choice + """ + import json + + from openai import OpenAI + + client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy") + + + def get_current_weather(location: str, unit: "str"): + """Get the current weather in a given location""" + if unit == "celsius": + return f"The current temperature in {location} is 22°C." + else: + return f"The current temperature in {location} is 72°F." 
+ + + tools = [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "City and state, e.g., 'San Francisco, CA'", + }, + "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}, + }, + "required": ["location", "unit"], + }, + }, + } + ] + messages = [{"role": "user", "content": "What's the weather in Fahrenheit like in San Francisco?"}] + response = client.chat.completions.create( + model=client.models.list().data[0].id, + messages=messages, + tools=tools, + tool_choice="auto", + ) + + tool_call = response.choices[0].message.tool_calls[0].function + + messages.append( + { + "role": "assistant", + "tool_calls": response.choices[0].message.tool_calls, + "reasoning": response.choices[0].message.reasoning, # append reasoning + } + ) + + # Simulate tool execution + available_tools = {"get_weather": get_current_weather} + + completion_tool_calls = response.choices[0].message.tool_calls + for call in completion_tool_calls: + tool_to_call = available_tools[call.function.name] + args = json.loads(call.function.arguments) + result = tool_to_call(**args) + messages.append( + { + "role": "tool", + "content": result, + "tool_call_id": call.id, + "name": call.function.name, + } + ) + response_2 = client.chat.completions.create( + model=client.models.list().data[0].id, + messages=messages, + tools=tools, + tool_choice="auto", + ) + print(response_2.choices[0].message.content) + ``` +This example demonstrates how to set up interleaved thinking with tool calls using a weather retrieval function. The model reasons about the tool results before generating the final response. diff --git a/examples/online_serving/openai_chat_completion_client_with_tools.py b/examples/online_serving/openai_chat_completion_client_with_tools.py index 41dbb3236297..0bd1d05322f8 100644 --- a/examples/online_serving/openai_chat_completion_client_with_tools.py +++ b/examples/online_serving/openai_chat_completion_client_with_tools.py @@ -161,6 +161,7 @@ def main(): { "role": "assistant", "tool_calls": chat_completion.choices[0].message.tool_calls, + "reasoning": chat_completion.choices[0].message.reasoning, } ) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index d7d6419d643b..3b722c2d9277 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -240,6 +240,9 @@ class CustomChatCompletionMessageParam(TypedDict, total=False): tool_calls: Iterable[ChatCompletionMessageToolCallParam] | None """The tool calls generated by the model, such as function calls.""" + reasoning: str | None + """The reasoning content for interleaved thinking.""" + ChatCompletionMessageParam: TypeAlias = ( OpenAIChatCompletionMessageParam @@ -265,6 +268,12 @@ class ConversationMessage(TypedDict, total=False): tool_calls: Iterable[ChatCompletionMessageToolCallParam] | None """The tool calls generated by the model, such as function calls.""" + reasoning: str | None + """The reasoning content for interleaved thinking.""" + + reasoning_content: str | None + """Deprecated: The reasoning content for interleaved thinking.""" + # Passed in by user ChatTemplateContentFormatOption = Literal["auto", "string", "openai"] @@ -1374,7 +1383,7 @@ def _parse_chat_message_content( ) -> list[ConversationMessage]: role = message["role"] content = message.get("content") - + reasoning = message.get("reasoning") or 
message.get("reasoning_content") if content is None: content = [] elif isinstance(content, str): @@ -1396,6 +1405,12 @@ def _parse_chat_message_content( # follow the OpenAI spec. if "tool_calls" in parsed_msg and parsed_msg["tool_calls"] is not None: result_msg["tool_calls"] = list(parsed_msg["tool_calls"]) + # Include reasoning if present for interleaved thinking. + if reasoning is not None: + result_msg["reasoning"] = cast(str, reasoning) + result_msg["reasoning_content"] = cast( + str, reasoning + ) # keep compatibility elif role == "tool": parsed_msg = _ToolParser(message) if "tool_call_id" in parsed_msg: From 11ac9ddd037c63a8c9404cd1f62f9f81a5f38652 Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Date: Wed, 12 Nov 2025 22:57:20 -1000 Subject: [PATCH 004/578] Support all interleaved layer types (#28485) Signed-off-by: Yong Hoon Shin --- vllm/transformers_utils/config.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 14cae2b168e1..b7418cfb7cc7 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -472,8 +472,7 @@ def is_interleaved(config: PretrainedConfig) -> bool: """ text_config = config.get_text_config() if layer_types := getattr(text_config, "layer_types", None): - interleaved_types = {"full_attention", "sliding_attention"} - return interleaved_types.issubset(layer_types) + return len(set(layer_types)) > 1 return False From e63fd445605b442a81a4eb2f402206cc337ab8dd Mon Sep 17 00:00:00 2001 From: Di Wu <95495325+dw2761@users.noreply.github.com> Date: Thu, 13 Nov 2025 18:57:44 +0800 Subject: [PATCH 005/578] Fix: Correctly filter special tokens in benchmark_prefix_caching (#28615) Signed-off-by: Di Wu --- benchmarks/benchmark_prefix_caching.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index 146c268a6b7f..28fc383a318d 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -69,7 +69,7 @@ def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> list[int]: # Remove the special tokens. 
return random.choices( - [v for k, v in vocab.items() if k not in all_special_ids], + [v for v in vocab.values() if v not in all_special_ids], k=length, ) From 5e973209aaf5fa15459555eaa42bcd20ea63aa0d Mon Sep 17 00:00:00 2001 From: Zijing Liu Date: Thu, 13 Nov 2025 03:30:04 -0800 Subject: [PATCH 006/578] [BugFix] Fix type error when assign a trition kernel tensor to a torch.nn.Parameter (#28603) Signed-off-by: Zijing Liu --- vllm/model_executor/layers/quantization/mxfp4.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 0f69a18a1f3f..5552c1ae5edf 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -755,8 +755,8 @@ def _interleave_mxfp4_cutlass_sm90(w): self.w13_weight = w13_weight self.w2_weight = w2_weight - layer.w13_weight = w13_weight - layer.w2_weight = w2_weight + layer.w13_weight = Parameter(w13_weight.data, requires_grad=False) + layer.w2_weight = Parameter(w2_weight.data, requires_grad=False) else: raise ValueError(f"Unsupported backend: {self.mxfp4_backend}") From c428e8d80b2bc17b0a306d1e80c8e4567b9dd9f4 Mon Sep 17 00:00:00 2001 From: baonudesifeizhai <85092850+baonudesifeizhai@users.noreply.github.com> Date: Thu, 13 Nov 2025 06:34:14 -0500 Subject: [PATCH 007/578] Fix io processor pooling #28273 (#28484) Signed-off-by: baonudesifeizhai --- vllm/entrypoints/openai/serving_pooling.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py index 0eade272111f..ee4c5c8bacaa 100644 --- a/vllm/entrypoints/openai/serving_pooling.py +++ b/vllm/entrypoints/openai/serving_pooling.py @@ -4,7 +4,7 @@ import asyncio import json import time -from collections.abc import AsyncGenerator +from collections.abc import AsyncGenerator, Sequence from typing import Final, cast import jinja2 @@ -122,6 +122,10 @@ async def create_pooling( engine_prompts = await self.io_processor.pre_process_async( prompt=validated_prompt, request_id=request_id ) + if not isinstance(engine_prompts, Sequence) or isinstance( + engine_prompts, (str, bytes, bytearray) + ): + engine_prompts = [engine_prompts] elif isinstance(request, PoolingChatRequest): error_check_ret = self._validate_chat_template( From c47b6c85ac25ecb0a26dfff76c70a0b1a9a4a6bf Mon Sep 17 00:00:00 2001 From: zofia <110436990+zufangzhu@users.noreply.github.com> Date: Thu, 13 Nov 2025 19:35:04 +0800 Subject: [PATCH 008/578] [XPU] add sym params to IPEXConfig (#28611) Signed-off-by: Zhu, Zufang --- .../layers/quantization/ipex_quant.py | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py index e0234191c62b..5ca9167faec8 100644 --- a/vllm/model_executor/layers/quantization/ipex_quant.py +++ b/vllm/model_executor/layers/quantization/ipex_quant.py @@ -52,6 +52,7 @@ def __init__( modules_to_not_convert: list[str] | None = None, desc_act: bool | None = None, lm_head_quantized: bool | None = None, + is_sym: bool | None = None, ) -> None: super().__init__() self.method = method @@ -60,6 +61,7 @@ def __init__( self.modules_to_not_convert = modules_to_not_convert or [] self.desc_act = desc_act self.lm_head_quantized = lm_head_quantized + self.is_sym = is_sym self.pack_factor = 32 // self.weight_bits if self.weight_bits not in [4]: @@ 
-108,15 +110,25 @@ def from_config(cls, config: dict[str, Any]) -> "IPEXConfig": modules_to_not_convert = cls.get_from_keys_or( config, ["modules_to_not_convert"], None ) + is_sym = not cls.get_from_keys_or(config, ["zero_point"], default=False) return cls( - method, weight_bits, group_size, modules_to_not_convert, False, False + method, + weight_bits, + group_size, + modules_to_not_convert, + False, + False, + is_sym, ) # otherwise for gptq weight_bits = cls.get_from_keys(config, ["bits"]) group_size = cls.get_from_keys(config, ["group_size"]) lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False) desc_act = cls.get_from_keys_or(config, ["desc_act"], default=False) - return cls(method, weight_bits, group_size, [], desc_act, lm_head_quantized) + is_sym = cls.get_from_keys_or(config, ["sym"], default=True) + return cls( + method, weight_bits, group_size, [], desc_act, lm_head_quantized, is_sym + ) @classmethod def override_quantization_method( @@ -180,6 +192,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: # The float activation will be quantized (dynamic, per-token) to INT8. act_quant_mode = ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK + assert isinstance(self.quant_config, IPEXConfig) qconfig = ipex.quantization.get_weight_only_quant_qconfig_mapping( weight_dtype=weight_dtype, lowp_mode=lowp_mode, @@ -200,6 +213,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: bias=bias, group_size=self.quant_config.group_size, quant_method=IPEXConfig.IPEX_QUANT_METHOD_MAP["gptq"], + weight_qscheme="sym" if self.quant_config.is_sym else "asym", ) ) @@ -250,6 +264,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: # The float activation will be quantized (dynamic, per-token) to INT8. 
act_quant_mode = ipex.quantization.WoqActQuantMode.PER_BATCH + assert isinstance(self.quant_config, IPEXConfig) qconfig = ipex.quantization.get_weight_only_quant_qconfig_mapping( weight_dtype=weight_dtype, lowp_mode=lowp_mode, @@ -269,6 +284,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: bias=bias, group_size=self.quant_config.group_size, quant_method=IPEXConfig.IPEX_QUANT_METHOD_MAP["awq"], # type: ignore + weight_qscheme="sym" if self.quant_config.is_sym else "asym", ) ) From c9fe6abe7c0b03d552420edd63c6c678ed683dea Mon Sep 17 00:00:00 2001 From: Fanli Lin Date: Thu, 13 Nov 2025 21:06:06 +0800 Subject: [PATCH 009/578] [Bugfix] Fix FPS value type for Qwen2.5-Omni video processing (#28630) Signed-off-by: Lin, Fanli --- examples/offline_inference/vision_language.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 371cf6309a67..624de2a2debc 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -1536,7 +1536,7 @@ def run_qwen2_5_omni(questions: list[str], modality: str): mm_processor_kwargs={ "min_pixels": 28 * 28, "max_pixels": 1280 * 28 * 28, - "fps": [1], + "fps": 1, }, limit_mm_per_prompt={modality: 1}, ) From 86d15bfd8d681a2ca2f3b2e550149a5ba3282ef1 Mon Sep 17 00:00:00 2001 From: Akash kaothalkar <61960177+Akashcodes732@users.noreply.github.com> Date: Thu, 13 Nov 2025 19:02:21 +0530 Subject: [PATCH 010/578] [Hardware][PowerPC] Fix fp16 compilation error for Power in cpu attention backend and bump oneDNN version (#28535) Signed-off-by: Akash Kaothalkar Co-authored-by: Akash Kaothalkar --- cmake/cpu_extension.cmake | 4 ++-- csrc/cpu/cpu_attn_impl.hpp | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index bb0179c79c10..aa84125818d1 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -242,7 +242,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON SUBBUILD_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-subbuild" SOURCE_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-src" GIT_REPOSITORY https://github.com/ARM-software/ComputeLibrary.git - GIT_TAG v52.2.0 + GIT_TAG v52.6.0 GIT_SHALLOW TRUE GIT_PROGRESS TRUE ) @@ -310,7 +310,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON FetchContent_Declare( oneDNN GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git - GIT_TAG v3.9 + GIT_TAG v3.10 GIT_PROGRESS TRUE GIT_SHALLOW TRUE ) diff --git a/csrc/cpu/cpu_attn_impl.hpp b/csrc/cpu/cpu_attn_impl.hpp index 8f4c78099802..c317453530af 100644 --- a/csrc/cpu/cpu_attn_impl.hpp +++ b/csrc/cpu/cpu_attn_impl.hpp @@ -821,10 +821,12 @@ struct VecTypeTrait { using vec_t = vec_op::BF16Vec16; }; +#if !defined(__powerpc__) template <> struct VecTypeTrait { using vec_t = vec_op::FP16Vec16; }; +#endif template void print_logits(const char* name, T* ptr, int32_t row, int32_t col, From 8da2f28f53c14e2c21c50821d89e3909d9c84af6 Mon Sep 17 00:00:00 2001 From: Pleaplusone Date: Thu, 13 Nov 2025 22:18:20 +0800 Subject: [PATCH 011/578] [ROCm][BugFix]Fix `get_cu_count` in rocm_aiter_fa.py (#28618) Signed-off-by: ganyi --- vllm/v1/attention/backends/rocm_aiter_fa.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index c7f925817a6a..ad454daa582e 100644 --- 
a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -18,6 +18,7 @@ from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.utils.math_utils import cdiv +from vllm.utils.platform_utils import get_cu_count from vllm.v1.attention.backends.utils import ( AttentionCGSupport, AttentionMetadataBuilder, @@ -38,7 +39,7 @@ def block_size(x, head_dim): return min(65536 // x.element_size(), triton.next_power_of_2(head_dim)) def num_programs(total_tokens): - return min(total_tokens, current_platform.get_cu_count()) + return min(total_tokens, get_cu_count()) @triton.jit def cp_mha_gather_cache_kernel( From a7791eac9d29a4a26b007db42130a9e28b3e77ee Mon Sep 17 00:00:00 2001 From: amdfaa <107946068+amdfaa@users.noreply.github.com> Date: Thu, 13 Nov 2025 09:34:55 -0500 Subject: [PATCH 012/578] [CI/Build] Install uv for AMD MI300: Language Models Tests (Hybrid) %N (#28142) Signed-off-by: amdfaa <107946068+amdfaa@users.noreply.github.com> Signed-off-by: zhewenli Co-authored-by: zhewenli --- docker/Dockerfile.rocm | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 06d229f315bd..137452cad2c1 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -15,6 +15,20 @@ RUN apt-get update -q -y && apt-get install -q -y \ # Remove sccache RUN python3 -m pip install --upgrade pip RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)" + +# Install UV +RUN curl -LsSf https://astral.sh/uv/install.sh | sh + +# Activate virtual environment and add uv to PATH +ENV PATH="/root/.local/bin:$PATH" + +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 +ENV UV_INDEX_STRATEGY="unsafe-best-match" +# Use copy mode to avoid hardlink failures with Docker cache mounts +ENV UV_LINK_MODE=copy + ARG COMMON_WORKDIR WORKDIR ${COMMON_WORKDIR} @@ -59,13 +73,15 @@ FROM base AS test RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/* -# Install vLLM +# Install vLLM using uv (inherited from base stage) +# Note: No -U flag to avoid upgrading PyTorch ROCm to CUDA version RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ + --mount=type=cache,target=/root/.cache/uv \ cd /install \ - && pip install -U -r requirements/rocm.txt \ - && pip install -U -r requirements/rocm-test.txt \ + && uv pip install --system -r requirements/rocm.txt \ + && uv pip install --system -r requirements/rocm-test.txt \ && pip uninstall -y vllm \ - && pip install *.whl + && uv pip install --system *.whl WORKDIR /vllm-workspace ARG COMMON_WORKDIR @@ -89,14 +105,17 @@ RUN case "$(which python3)" in \ rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \ *) ;; esac -RUN python3 -m pip install --upgrade huggingface-hub[cli] +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system --upgrade huggingface-hub[cli] -# Install vLLM +# Install vLLM using uv (inherited from base stage) +# Note: No -U flag to avoid upgrading PyTorch ROCm to CUDA version RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ + --mount=type=cache,target=/root/.cache/uv \ cd /install \ - && pip install -U -r requirements/rocm.txt \ + && uv pip install --system -r requirements/rocm.txt \ && pip uninstall -y vllm \ - && pip install *.whl + && uv pip 
install --system *.whl ARG COMMON_WORKDIR From 07a606aa7eb30923a3cc631185d93de9e51b37cb Mon Sep 17 00:00:00 2001 From: Huamin Li <3ericli@gmail.com> Date: Thu, 13 Nov 2025 07:11:27 -0800 Subject: [PATCH 013/578] [CI Failure] Fix backend selection for encoder-only models (#28534) Signed-off-by: Huamin Li <3ericli@gmail.com> --- vllm/attention/backends/abstract.py | 14 ++++++++++++++ vllm/attention/layer.py | 1 + vllm/attention/layers/encoder_only_attention.py | 6 +++++- vllm/attention/selector.py | 5 +++++ vllm/platforms/cpu.py | 1 + vllm/platforms/cuda.py | 10 ++++++++++ vllm/platforms/interface.py | 1 + vllm/platforms/rocm.py | 1 + vllm/platforms/tpu.py | 1 + vllm/platforms/xpu.py | 1 + vllm/v1/attention/backends/cpu_attn.py | 11 +++++++++++ vllm/v1/attention/backends/flash_attn.py | 12 ++++++++++++ vllm/v1/attention/backends/flex_attention.py | 7 +++++++ vllm/v1/attention/backends/mla/flashmla_sparse.py | 10 +++++----- 14 files changed, 75 insertions(+), 6 deletions(-) diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 697beed91869..9275d70fd86a 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -142,6 +142,17 @@ def supports_sink(cls) -> bool: def is_sparse(cls) -> bool: return False + @classmethod + def supports_attn_type(cls, attn_type: str) -> bool: + """Check if backend supports a given attention type. + + By default, only supports decoder attention. + Backends should override this to support other attention types. + """ + from vllm.attention import AttentionType + + return attn_type == AttentionType.DECODER + @classmethod def supports_compute_capability(cls, capability: "DeviceCapability") -> bool: return True @@ -171,6 +182,7 @@ def validate_configuration( has_sink: bool, use_sparse: bool, device_capability: "DeviceCapability", + attn_type: str, ) -> list[str]: invalid_reasons = [] if not cls.supports_head_size(head_size): @@ -195,6 +207,8 @@ def validate_configuration( invalid_reasons.append("non-sparse not supported") if not cls.supports_compute_capability(device_capability): invalid_reasons.append("compute capability not supported") + if not cls.supports_attn_type(attn_type): + invalid_reasons.append(f"attention type {attn_type} not supported") combination_reason = cls.supports_combination( head_size, dtype, diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 487bba76babf..37f9a4b383ce 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -291,6 +291,7 @@ def __init__( block_size, use_mla=False, has_sink=self.has_sink, + attn_type=attn_type, ) else: self.attn_backend = attn_backend diff --git a/vllm/attention/layers/encoder_only_attention.py b/vllm/attention/layers/encoder_only_attention.py index 4929bbf5efc7..5e99c9901003 100644 --- a/vllm/attention/layers/encoder_only_attention.py +++ b/vllm/attention/layers/encoder_only_attention.py @@ -74,7 +74,11 @@ def __init__( block_size = 16 underlying_attn_backend = get_attn_backend( - head_size, dtype, kv_cache_dtype, block_size + head_size, + dtype, + kv_cache_dtype, + block_size, + attn_type=AttentionType.ENCODER_ONLY, ) attn_backend = create_encoder_only_attention_backend(underlying_attn_backend) diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 262cdf0e575b..1a092db9ce37 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -76,6 +76,7 @@ def get_attn_backend( use_mla: bool = False, has_sink: bool = False, use_sparse: bool = False, + attn_type: str | None = None, ) -> 
type[AttentionBackend]: """Selects which attention backend to use and lazily imports it.""" @@ -94,6 +95,7 @@ def get_attn_backend( use_mla=use_mla, has_sink=has_sink, use_sparse=use_sparse, + attn_type=attn_type, ) @@ -106,6 +108,7 @@ def _cached_get_attn_backend( use_mla: bool = False, has_sink: bool = False, use_sparse: bool = False, + attn_type: str | None = None, ) -> type[AttentionBackend]: # Check whether a particular choice of backend was # previously forced. @@ -159,6 +162,7 @@ def _cached_get_attn_backend( use_mla, has_sink, use_sparse, + attn_type, ) else: attention_cls = current_platform.get_attn_backend_cls( @@ -170,6 +174,7 @@ def _cached_get_attn_backend( use_mla, has_sink, use_sparse, + attn_type, ) if not attention_cls: raise ValueError( diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 8b3b8d4cb44f..cf954768689f 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -134,6 +134,7 @@ def get_attn_backend_cls( use_mla: bool, has_sink: bool, use_sparse: bool, + attn_type: str | None = None, ) -> str: from vllm.attention.backends.registry import AttentionBackendEnum diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index ebcc290a64cd..2e4dd8bb808b 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -298,6 +298,7 @@ def get_valid_backends( has_sink, use_sparse, device_capability, + attn_type, ) -> tuple[ list[tuple["AttentionBackendEnum", int]], dict["AttentionBackendEnum", list[str]], @@ -318,6 +319,7 @@ def get_valid_backends( has_sink, use_sparse, device_capability, + attn_type, ) except ImportError: invalid_reasons_i = ["ImportError"] @@ -339,7 +341,13 @@ def get_attn_backend_cls( use_mla: bool, has_sink: bool, use_sparse: bool, + attn_type: str | None = None, ) -> str: + from vllm.attention import AttentionType + + if attn_type is None: + attn_type = AttentionType.DECODER + device_capability = cls.get_device_capability() assert device_capability is not None @@ -356,6 +364,7 @@ def get_attn_backend_cls( has_sink, use_sparse, device_capability, + attn_type, ) except ImportError: invalid_reasons = ["ImportError"] @@ -379,6 +388,7 @@ def get_attn_backend_cls( has_sink, use_sparse, device_capability, + attn_type, ) reasons_str = ( "{" diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 12c377384270..0471c20429b1 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -222,6 +222,7 @@ def get_attn_backend_cls( use_mla: bool, has_sink: bool, use_sparse: bool, + attn_type: str | None = None, ) -> str: """Get the attention backend class of a device.""" return "" diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index d20dc9e6b067..788f9d69c357 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -216,6 +216,7 @@ def get_attn_backend_cls( use_mla, has_sink, use_sparse, + attn_type: str | None = None, ) -> str: from vllm._aiter_ops import rocm_aiter_ops from vllm.attention.backends.registry import AttentionBackendEnum diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 4773fef6829d..b997bb9e6999 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -61,6 +61,7 @@ def get_attn_backend_cls( use_mla: bool, has_sink, use_sparse, + attn_type: str | None = None, ) -> str: from vllm.attention.backends.registry import AttentionBackendEnum diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index c629325f76a3..5552e4ca4b2f 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -51,6 +51,7 @@ def get_attn_backend_cls( use_mla: 
bool, has_sink: bool, use_sparse, + attn_type: str | None = None, ) -> str: from vllm.v1.attention.backends.utils import set_kv_cache_layout diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py index 674398e19c4c..f1254352c058 100644 --- a/vllm/v1/attention/backends/cpu_attn.py +++ b/vllm/v1/attention/backends/cpu_attn.py @@ -48,6 +48,17 @@ def get_supported_head_sizes(cls) -> list[int]: def get_name() -> str: return "CPU_ATTN" + @classmethod + def supports_attn_type(cls, attn_type: str) -> bool: + """CPU attention supports decoder and encoder-only attention.""" + from vllm.attention import AttentionType + + return attn_type in ( + AttentionType.DECODER, + AttentionType.ENCODER, + AttentionType.ENCODER_ONLY, + ) + @staticmethod def get_impl_cls() -> type["CPUAttentionBackendImpl"]: return CPUAttentionBackendImpl diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index d9bd52d8f980..bfb4a45c2b56 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -66,6 +66,18 @@ class FlashAttentionBackend(AttentionBackend): def get_name() -> str: return "FLASH_ATTN" + @classmethod + def supports_attn_type(cls, attn_type: str) -> bool: + """FlashAttention supports all attention types.""" + from vllm.attention import AttentionType + + return attn_type in ( + AttentionType.DECODER, + AttentionType.ENCODER, + AttentionType.ENCODER_ONLY, + AttentionType.ENCODER_DECODER, + ) + @staticmethod def get_impl_cls() -> type["FlashAttentionImpl"]: return FlashAttentionImpl diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py index e53cd0d8af4f..7768827d26dc 100644 --- a/vllm/v1/attention/backends/flex_attention.py +++ b/vllm/v1/attention/backends/flex_attention.py @@ -84,6 +84,13 @@ class FlexAttentionBackend(AttentionBackend): def get_name() -> str: return "FLEX_ATTENTION" + @classmethod + def supports_attn_type(cls, attn_type: str) -> bool: + """FlexAttention supports both decoder and encoder-only attention.""" + from vllm.attention import AttentionType + + return attn_type in (AttentionType.DECODER, AttentionType.ENCODER_ONLY) + @staticmethod def get_impl_cls() -> type["FlexAttentionImpl"]: return FlexAttentionImpl diff --git a/vllm/v1/attention/backends/mla/flashmla_sparse.py b/vllm/v1/attention/backends/mla/flashmla_sparse.py index 5fe9c69d3500..bb8d914d1571 100644 --- a/vllm/v1/attention/backends/mla/flashmla_sparse.py +++ b/vllm/v1/attention/backends/mla/flashmla_sparse.py @@ -40,14 +40,14 @@ """ NOTE: FlashMLA Sparse uses an fp8 cache with the following format -In the "FP8 with scale" format, each token's KV cache is 656 Bytes, +In the "FP8 with scale" format, each token's KV cache is 656 Bytes, structured as: -- **First 512 bytes:** The "quantized NoPE" part, containing 512 +- **First 512 bytes:** The "quantized NoPE" part, containing 512 `float8_e4m3` values. -- **Next 16 bytes:** Scale factors, containing 4 `float32` values. - The first `float32` is the scale for the first 128 `float8_e4m3` values, +- **Next 16 bytes:** Scale factors, containing 4 `float32` values. + The first `float32` is the scale for the first 128 `float8_e4m3` values, the second for the next 128, and so on. -- **Last 128 bytes:** The "RoPE" part, containing 64 `bfloat16` values. This +- **Last 128 bytes:** The "RoPE" part, containing 64 `bfloat16` values. This part is not quantized for accuracy. 
""" From 3035d1a166821272d4e7eb204e2c613bb02bacd7 Mon Sep 17 00:00:00 2001 From: Yuanping Song Date: Thu, 13 Nov 2025 10:24:35 -0500 Subject: [PATCH 014/578] [BugFix] DeepSeek-OCR: apply NoRepeatNGramLogitsProcessor to greedy path (#28617) Signed-off-by: Yuanping Song --- vllm/model_executor/models/deepseek_ocr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/deepseek_ocr.py b/vllm/model_executor/models/deepseek_ocr.py index c89caab93a1e..8179f916ff41 100644 --- a/vllm/model_executor/models/deepseek_ocr.py +++ b/vllm/model_executor/models/deepseek_ocr.py @@ -161,7 +161,7 @@ def validate_params(cls, params: SamplingParams): ) def is_argmax_invariant(self) -> bool: - return True + return False def new_req_logits_processor( self, From b230286fbc0b6d192e176ead55000471fd4f1080 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 13 Nov 2025 16:02:42 +0000 Subject: [PATCH 015/578] Fix `get_num_experts` when config sets it explicitly to `None` (#28652) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: bruceszchen --- vllm/config/model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index c47b619118ff..f4ed99689e5b 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -1342,7 +1342,8 @@ def get_num_experts(self) -> int: # Ernie VL's remote code uses list[int]... # The values are always the same so we just take the first one. return num_experts[0] - return num_experts + # Coerce to 0 if explicitly set to None + return num_experts or 0 def get_layers_start_end_indices( self, parallel_config: ParallelConfig From d3387750f191f3bcf6607db95436147bbccfacb3 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 13 Nov 2025 08:38:08 -0800 Subject: [PATCH 016/578] [Misc] Turn off encoder torch compile by default (#28634) Signed-off-by: Roger Wang --- tests/compile/test_multimodal_compile.py | 9 ++++++--- tests/models/multimodal/generation/test_common.py | 2 ++ vllm/config/compilation.py | 5 +++-- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/tests/compile/test_multimodal_compile.py b/tests/compile/test_multimodal_compile.py index b76c29819a2d..621f6a51a918 100644 --- a/tests/compile/test_multimodal_compile.py +++ b/tests/compile/test_multimodal_compile.py @@ -10,8 +10,8 @@ def test_compile(): vllm_config = VllmConfig() - # Default configuration compiles mm encoder - assert vllm_config.compilation_config.compile_mm_encoder + # Default configuration does not compile mm encoder + assert not vllm_config.compilation_config.compile_mm_encoder # forked needed to workaround https://github.com/vllm-project/vllm/issues/21073 @@ -39,7 +39,10 @@ def test_qwen2_5_vl_compilation(vllm_runner, monkeypatch): "Qwen/Qwen2.5-VL-3B-Instruct", max_model_len=2048, gpu_memory_utilization=0.8, - compilation_config={"mode": CompilationMode.VLLM_COMPILE}, + compilation_config={ + "mode": CompilationMode.VLLM_COMPILE, + "compile_mm_encoder": True, + }, ) as _, ): pass diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index 5504c417fda4..22083d9f1614 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -131,6 +131,7 @@ prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 img_idx_to_prompt=lambda idx: 
"<|vision_start|><|image_pad|><|vision_end|>", video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", + enforce_eager=False, max_model_len=4096, max_num_seqs=2, auto_cls=AutoModelForImageTextToText, @@ -160,6 +161,7 @@ VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO, ), + enforce_eager=False, needs_video_metadata=True, prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501 diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index b0d1bc2bab30..10673041aa68 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -320,9 +320,10 @@ class CompilationConfig: If None, defaults to attention ops for piecewise cudagraphs. If empty list [], no ops are excluded (suitable for full cudagraphs).""" - compile_mm_encoder: bool = True + compile_mm_encoder: bool = False """Whether or not to compile the multimodal encoder. - Currently, this only works for `Qwen2_5_vl`.""" + Currently, this only works for `Qwen2_5_vl` on selected platforms. + Disabled by default until more models are supported/tested to work.""" # Inductor capture use_inductor: bool | None = None From 06c4873d959feb0d4cb062ef17cdd0dd09dbf10f Mon Sep 17 00:00:00 2001 From: "Jane (Yuan) Xu" <31798555+janeyx99@users.noreply.github.com> Date: Thu, 13 Nov 2025 11:52:50 -0500 Subject: [PATCH 017/578] Rewrite C++ meta funcs to Python (#28595) Signed-off-by: Jane Xu --- .../gptq_marlin/awq_marlin_repack.cu | 16 -------- .../gptq_marlin/gptq_marlin_repack.cu | 16 -------- vllm/_custom_ops.py | 39 ++++++++++++++++++- 3 files changed, 38 insertions(+), 33 deletions(-) diff --git a/csrc/quantization/gptq_marlin/awq_marlin_repack.cu b/csrc/quantization/gptq_marlin/awq_marlin_repack.cu index 8ba617a9e655..e607107b3e77 100644 --- a/csrc/quantization/gptq_marlin/awq_marlin_repack.cu +++ b/csrc/quantization/gptq_marlin/awq_marlin_repack.cu @@ -247,22 +247,6 @@ torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k, return out; } -torch::Tensor awq_marlin_repack_meta(torch::Tensor& b_q_weight, - c10::SymInt size_k, c10::SymInt size_n, - int64_t num_bits) { - int const pack_factor = 32 / num_bits; - auto options = torch::TensorOptions() - .dtype(b_q_weight.dtype()) - .device(b_q_weight.device()); - return torch::empty_symint( - {size_k / marlin::tile_size, size_n * marlin::tile_size / pack_factor}, - options); -} - TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { m.impl("awq_marlin_repack", &awq_marlin_repack); } - -TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, Meta, m) { - m.impl("awq_marlin_repack", &awq_marlin_repack_meta); -} diff --git a/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu b/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu index 7c2d089a70d9..ad80d51ece94 100644 --- a/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu +++ b/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu @@ -321,22 +321,6 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm, return out; } -torch::Tensor gptq_marlin_repack_meta(torch::Tensor& b_q_weight, - torch::Tensor& perm, c10::SymInt size_k, - c10::SymInt size_n, int64_t num_bits) { - int const pack_factor = 32 / num_bits; - auto options = torch::TensorOptions() - .dtype(b_q_weight.dtype()) - .device(b_q_weight.device()); - return torch::empty_symint( - {size_k / marlin::tile_size, size_n * marlin::tile_size / pack_factor}, - options); -} - 
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { m.impl("gptq_marlin_repack", &gptq_marlin_repack); } - -TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, Meta, m) { - m.impl("gptq_marlin_repack", &gptq_marlin_repack_meta); -} diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 7d70c01cefbb..096266c9764e 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1174,13 +1174,50 @@ def gptq_marlin_repack( return torch.ops._C.gptq_marlin_repack(b_q_weight, perm, size_k, size_n, num_bits) -# gptq_marlin +if hasattr(torch.ops._C, "gptq_marlin_repack"): + + @register_fake("_C::gptq_marlin_repack") + def _gptq_marlin_repack_fake( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: torch.SymInt, + size_n: torch.SymInt, + num_bits: int, + ) -> torch.Tensor: + pack_factor = 32 // num_bits + marlin_tile_size = 16 + return torch.empty( + (size_k // marlin_tile_size, size_n * marlin_tile_size // pack_factor), + dtype=b_q_weight.dtype, + device=b_q_weight.device, + ) + + +# awq_marlin def awq_marlin_repack( b_q_weight: torch.Tensor, size_k: int, size_n: int, num_bits: int ) -> torch.Tensor: return torch.ops._C.awq_marlin_repack(b_q_weight, size_k, size_n, num_bits) +if hasattr(torch.ops._C, "awq_marlin_repack"): + + @register_fake("_C::awq_marlin_repack") + def _awq_marlin_repack_fake( + b_q_weight: torch.Tensor, + size_k: torch.SymInt, + size_n: torch.SymInt, + num_bits: int, + ) -> torch.Tensor: + pack_factor = 32 // num_bits + marlin_tile_size = 16 + return torch.empty( + (size_k // marlin_tile_size, size_n * marlin_tile_size // pack_factor), + dtype=b_q_weight.dtype, + device=b_q_weight.device, + ) + + def gptq_marlin_moe_repack( b_q_weight: torch.Tensor, perm: torch.Tensor, From 327c0a9a23f2939923d02fbf882640753bf1e030 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 13 Nov 2025 09:14:08 -0800 Subject: [PATCH 018/578] [BugFix] Ensure `EngineArgs.create_engine_config` is idempotent (#28515) Signed-off-by: Nick Hill --- vllm/engine/arg_utils.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 13c7704f5bf3..ca7f5e5e3e05 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1631,40 +1631,39 @@ def create_engine_config( ) observability_config = ObservabilityConfig( - show_hidden_metrics_for_version=(self.show_hidden_metrics_for_version), + show_hidden_metrics_for_version=self.show_hidden_metrics_for_version, otlp_traces_endpoint=self.otlp_traces_endpoint, collect_detailed_traces=self.collect_detailed_traces, ) # Compilation config overrides + compilation_config = copy.deepcopy(self.compilation_config) if self.cuda_graph_sizes is not None: logger.warning( "--cuda-graph-sizes is deprecated and will be removed in v0.13.0 or " "v1.0.0, whichever is soonest. Please use --cudagraph-capture-sizes " "instead." ) - if self.compilation_config.cudagraph_capture_sizes is not None: + if compilation_config.cudagraph_capture_sizes is not None: raise ValueError( "cuda_graph_sizes and compilation_config." "cudagraph_capture_sizes are mutually exclusive" ) - self.compilation_config.cudagraph_capture_sizes = self.cuda_graph_sizes + compilation_config.cudagraph_capture_sizes = self.cuda_graph_sizes if self.cudagraph_capture_sizes is not None: - if self.compilation_config.cudagraph_capture_sizes is not None: + if compilation_config.cudagraph_capture_sizes is not None: raise ValueError( "cudagraph_capture_sizes and compilation_config." 
"cudagraph_capture_sizes are mutually exclusive" ) - self.compilation_config.cudagraph_capture_sizes = ( - self.cudagraph_capture_sizes - ) + compilation_config.cudagraph_capture_sizes = self.cudagraph_capture_sizes if self.max_cudagraph_capture_size is not None: - if self.compilation_config.max_cudagraph_capture_size is not None: + if compilation_config.max_cudagraph_capture_size is not None: raise ValueError( "max_cudagraph_capture_size and compilation_config." "max_cudagraph_capture_size are mutually exclusive" ) - self.compilation_config.max_cudagraph_capture_size = ( + compilation_config.max_cudagraph_capture_size = ( self.max_cudagraph_capture_size ) @@ -1679,7 +1678,7 @@ def create_engine_config( load_config=load_config, structured_outputs_config=self.structured_outputs_config, observability_config=observability_config, - compilation_config=self.compilation_config, + compilation_config=compilation_config, kv_transfer_config=self.kv_transfer_config, kv_events_config=self.kv_events_config, ec_transfer_config=self.ec_transfer_config, From fdfd5075aa0b9b32e3000554d719f1622acff800 Mon Sep 17 00:00:00 2001 From: Johnny Yang <24908445+jcyang43@users.noreply.github.com> Date: Thu, 13 Nov 2025 09:36:54 -0800 Subject: [PATCH 019/578] [TPU] patch TPU wheel build script to resolve metadata issue (#27279) Signed-off-by: Johnny Yang --- setup.py | 4 +++- tools/vllm-tpu/build.sh | 28 ++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0934a8608eb1..e9b36e2a2e03 100644 --- a/setup.py +++ b/setup.py @@ -545,7 +545,9 @@ def get_vllm_version() -> str: # Allow overriding the version. This is useful to build platform-specific # wheels (e.g. CPU, TPU) without modifying the source. if env_version := os.getenv("VLLM_VERSION_OVERRIDE"): - return env_version + print(f"Overriding VLLM version with {env_version} from VLLM_VERSION_OVERRIDE") + os.environ["SETUPTOOLS_SCM_PRETEND_VERSION"] = env_version + return get_version(write_to="vllm/_version.py") version = get_version(write_to="vllm/_version.py") sep = "+" if "+" not in version else "." # dev versions might contain + diff --git a/tools/vllm-tpu/build.sh b/tools/vllm-tpu/build.sh index fbc91e379df3..45ef8dfcb1db 100755 --- a/tools/vllm-tpu/build.sh +++ b/tools/vllm-tpu/build.sh @@ -7,6 +7,13 @@ TOOLS_DIR=$(cd "$(dirname "$SCRIPT_PATH_PARAM")" && pwd) # Absolute path to the REPO_ROOT=$(cd "$TOOLS_DIR/../../" && pwd) # Absolute path to the repo root VLLM_DIR="$REPO_ROOT/" # Path to the vllm sources +CHANGE_FILE_LIST=( + "vllm/entrypoints/cli/main.py" + "vllm/entrypoints/cli/run_batch.py" + "vllm/utils/__init__.py" + "vllm/platforms/__init__.py" +) + # Ensure we are not running from within the vllm directory if SCRIPT_PATH_PARAM is relative like "." if [ "$TOOLS_DIR" = "$VLLM_DIR" ]; then echo "Error: This script should not be run from the vllm directory directly if using relative paths." @@ -30,6 +37,20 @@ if ! grep -q "name = \"vllm-tpu\"" "$PYPROJECT_FILE"; then echo "Patching pyproject.toml project name to vllm-tpu..." cp "$PYPROJECT_FILE" "${PYPROJECT_FILE}.bak" sed -i '0,/^name = "vllm"/s//name = "vllm-tpu"/' "$PYPROJECT_FILE" + + echo "Patching ${CHANGE_FILE_LIST[@]} vllm to vllm-tpu..." 
+ # patching + # importlib.metadata.version('vllm') -> importlib.metadata.version('vllm-tpu') + # importlib.metadata.version("vllm") -> importlib.metadata.version("vllm-tpu") + # importlib.metadata.metadata('vllm') -> importlib.metadata.metadata('vllm-tpu') + # importlib.metadata.metadata("vllm") -> importlib.metadata.metadata("vllm-tpu") + # version('vllm') -> version('vllm-tpu') + # version("vllm") -> version("vllm-tpu") + sed -i \ + -e "s/importlib.metadata.version(\(['\"]\)vllm\1)/importlib.metadata.version(\1vllm-tpu\1)/" \ + -e "s/importlib.metadata.metadata(\(['\"]\)vllm\1)/importlib.metadata.metadata(\1vllm-tpu\1)/" \ + -e "s/version(\(['\"]\)vllm\1)/version(\1vllm-tpu\1)/" \ + "${CHANGE_FILE_LIST[@]}" PATCHED=true else PATCHED=false @@ -45,6 +66,13 @@ cleanup() { echo "Restoring original pyproject.toml..." cp "${PYPROJECT_FILE}.bak" "$PYPROJECT_FILE" rm -f "${PYPROJECT_FILE}.bak" + + echo "Restoring vllm code..." + sed -i \ + -e "s/importlib.metadata.version(\(['\"]\)vllm-tpu\1)/importlib.metadata.version(\1vllm\1)/" \ + -e "s/importlib.metadata.metadata(\(['\"]\)vllm-tpu\1)/importlib.metadata.metadata(\1vllm\1)/" \ + -e "s/version(\(['\"]\)vllm-tpu\1)/version(\1vllm\1)/" \ + "${CHANGE_FILE_LIST[@]}" fi } trap cleanup EXIT HUP INT QUIT PIPE TERM # Register cleanup function to run on script exit and various signals From fe1cd7704ddd3266ddc97181ab24a167b3c9223c Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Thu, 13 Nov 2025 13:16:55 -0500 Subject: [PATCH 020/578] [Performance][B200] silu_mul_quant: pack scales in int32 (#28358) Signed-off-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath --- csrc/quantization/activation_kernels.cu | 164 ++++++--- tests/conftest.py | 13 + tests/kernels/moe/test_deepep_deepgemm_moe.py | 18 +- tests/kernels/moe/test_deepep_moe.py | 2 +- .../moe/test_silu_mul_fp8_quant_deep_gemm.py | 311 +++++++++++++----- .../layers/fused_moe/batched_deep_gemm_moe.py | 76 +++-- vllm/utils/deep_gemm.py | 23 ++ 7 files changed, 461 insertions(+), 146 deletions(-) diff --git a/csrc/quantization/activation_kernels.cu b/csrc/quantization/activation_kernels.cu index 2521b2797e2c..0c3bcf3b64b2 100644 --- a/csrc/quantization/activation_kernels.cu +++ b/csrc/quantization/activation_kernels.cu @@ -279,17 +279,17 @@ __device__ __forceinline__ void token_bounds(int32_t n_tokens, } template + typename scale_t, int THREADS, typename Idx_t, bool CEIL_UE8M0, + int GROUP_SIZE = 128, int NUM_STAGES = 3> __global__ void silu_mul_fp8_quant_deep_gemm_kernel( const __nv_bfloat16* __restrict__ _input, fp8_type* __restrict__ _y_q, - float* __restrict__ _y_s, const int32_t* __restrict__ tokens_per_expert, + scale_t* __restrict__ _y_s, const int32_t* __restrict__ tokens_per_expert, // sizes Idx_t E, Idx_t T, Idx_t H, // strides (in elements) Idx_t stride_i_e, Idx_t stride_i_t, Idx_t stride_i_h, Idx_t stride_yq_e, Idx_t stride_yq_t, Idx_t stride_yq_h, Idx_t stride_ys_e, Idx_t stride_ys_t, - Idx_t stride_ys_g, Idx_t stride_counts_e) { + Idx_t stride_ys_g, Idx_t stride_ys_p, Idx_t stride_counts_e) { #ifndef USE_ROCM static constexpr int NUM_WARPS = THREADS / WARP_SIZE; @@ -466,9 +466,22 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel( __nv_fp8x4_e4m3* y_q_base_ptr = reinterpret_cast<__nv_fp8x4_e4m3*>(_y_q) + lane_id; - auto y_scale_base_ptr = _y_s + warp_position_scales * stride_ys_g; + + Idx_t scale_group_offset = 0; + if constexpr (std::is_same::value) { + // packed int32_t format + int pack_id = warp_position_scales / 4; + int scale_in_pack = 
warp_position_scales % 4; + scale_group_offset = pack_id * stride_ys_p + scale_in_pack * stride_ys_g; + } else { + scale_group_offset = warp_position_scales * stride_ys_g; + } + + scale_t* const y_scale_base_ptr = _y_s + scale_group_offset; for (auto j = tokens_lower; j < tokens_upper; j++) { + int current_group_id = warp_position_scales; // Running count of which + // group is being processed const Idx_t base_ys = expert_id * stride_ys_e; auto y_s_ptr = y_scale_base_ptr + base_ys + token_offset * stride_ys_t; __nv_fp8x4_e4m3* y_q_ptr = @@ -509,7 +522,7 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel( __nv_bfloat16 y_s = __hmul(warp_max(_y_max2.x), fp8_inv); - if constexpr (USE_UE8M0) { + if constexpr (CEIL_UE8M0) { y_s = hexp2(hceil(hlog2(y_s))); } @@ -527,8 +540,24 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel( y_q_ptr += WARP_SIZE * stride_yq_h; if (!lane_id) { - *y_s_ptr = y_s; - y_s_ptr += stride_ys_g; + // Store scales. + if constexpr (std::is_same::value) { + // Packed UE8MO format. Remove Mantissa. + *y_s_ptr = reinterpret_cast(y_s) >> 7; + + bool const jump_pack = (current_group_id + 1) % 4 == 0; + // Minus 3 because we need to get to the first group in the + // next pack. + y_s_ptr += jump_pack ? (stride_ys_p - 3) : stride_ys_g; + + } else { + // float32 format + static_assert(std::is_same::value); + *y_s_ptr = y_s; + y_s_ptr += stride_ys_g; + } + + current_group_id += 1; } } } @@ -573,7 +602,7 @@ void persistent_masked_m_silu_mul_quant( const at::Tensor& tokens_per_expert, // (E) at::Tensor& y_q, // (E, T, H) [OUT] at::Tensor& y_s, // (E, T, H//group_size) [OUT] - bool use_ue8m0) { + bool cast_scale_ue8m0) { #ifndef USE_ROCM // This kernel currently only supports H % 128 == 0 and assumes a @@ -583,9 +612,12 @@ void persistent_masked_m_silu_mul_quant( TORCH_CHECK(input.dtype() == torch::kBFloat16); TORCH_CHECK(y_q.dtype() == torch::kFloat8_e4m3fn || y_q.dtype() == torch::kFloat8_e4m3fnuz); - TORCH_CHECK(y_s.dtype() == torch::kFloat32); TORCH_CHECK(input.size(-1) % (GROUP_SIZE * 2) == 0); + bool const is_packed_ue8m0 = + (y_s.dtype() == torch::kInt32 && cast_scale_ue8m0); + TORCH_CHECK(y_s.dtype() == torch::kFloat32 || is_packed_ue8m0); + using Idx_t = int64_t; Idx_t E = input.size(0); @@ -597,15 +629,18 @@ void persistent_masked_m_silu_mul_quant( Idx_t stride_yq_e = y_q.stride(0); Idx_t stride_yq_t = y_q.stride(1); Idx_t stride_yq_h = y_q.stride(2); - Idx_t stride_ys_e = y_s.stride(0); - Idx_t stride_ys_t = y_s.stride(1); - Idx_t stride_ys_g = y_s.stride(2); Idx_t stride_counts_e = tokens_per_expert.stride(0); + int const NUM_GROUPS = H / GROUP_SIZE; + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - #define KERNEL(BLOCK_COUNT, USE_UE8M0, THREAD_COUNT, STAGES) \ + // TODO: Get this from cuda_arch ? 
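The packed path above relies on the bf16 bit layout (1 sign bit, 8 exponent bits, 7 mantissa bits): once a scale has been rounded up to a power of two, shifting out the 7 mantissa bits leaves exactly its UE8M0 byte, and four such bytes share one int32 slot (hence the jump by stride_ys_p after every fourth group). A minimal Python sketch of that bit trick, illustrative only and not part of the kernel or tests:

    import torch

    # Power-of-two scales, as the ceil-UE8M0 path above produces.
    scales = torch.tensor([0.5, 1.0, 2.0, 0.0078125], dtype=torch.bfloat16)
    # Reinterpret the bf16 bits and drop the 7 mantissa bits -> biased exponent.
    ue8m0 = (scales.view(torch.int16).to(torch.int32) >> 7) & 0xFF
    # Same value the long way: log2 of the scale plus the exponent bias of 127.
    expected = torch.log2(scales.to(torch.float32)).to(torch.int32) + 127
    assert torch.equal(ue8m0, expected)  # tensor([126, 127, 128, 120])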
+ static constexpr int SILU_V2_BLOCK_COUNT = 132 * 32; + + #define KERNEL(BLOCK_COUNT, scale_t, STRIDE_YS_E, STRIDE_YS_T, STRIDE_YS_G, \ + STRIDE_YS_P, CEIL_UE8M0, THREAD_COUNT, STAGES) \ static constexpr int NUM_WARPS = THREAD_COUNT / WARP_SIZE; \ int sms = SILU_V2_BLOCK_COUNT; \ static constexpr int max_shared_mem_bytes = \ @@ -615,43 +650,86 @@ void persistent_masked_m_silu_mul_quant( VLLM_DISPATCH_FP8_TYPES( \ y_q.scalar_type(), "silu_mul_fp8_quant_deep_gemm_kernel", [&] { \ vllm::silu_mul_fp8_quant_deep_gemm_kernel< \ - BLOCK_COUNT, max_shared_mem_bytes, fp8_t, THREAD_COUNT, Idx_t, \ - USE_UE8M0, GROUP_SIZE, STAGES> \ + BLOCK_COUNT, max_shared_mem_bytes, fp8_t, scale_t, THREAD_COUNT, \ + Idx_t, CEIL_UE8M0, GROUP_SIZE, STAGES> \ <<>>( \ reinterpret_cast<__nv_bfloat16*>(input.data_ptr()), \ - (fp8_t*)y_q.data_ptr(), y_s.data_ptr(), \ + (fp8_t*)y_q.data_ptr(), \ + reinterpret_cast(y_s.data_ptr()), \ reinterpret_cast(tokens_per_expert.data_ptr()), E, \ T, H, stride_i_e, stride_i_t, stride_i_h, stride_yq_e, \ - stride_yq_t, stride_yq_h, stride_ys_e, stride_ys_t, \ - stride_ys_g, stride_counts_e); \ + stride_yq_t, stride_yq_h, STRIDE_YS_E, STRIDE_YS_T, \ + STRIDE_YS_G, STRIDE_YS_P, stride_counts_e); \ }); - static constexpr int SILU_V2_BLOCK_COUNT = 132 * 32; - - int const NUM_GROUPS = H / GROUP_SIZE; - if (!use_ue8m0) { - if (H >= 4096 && (NUM_GROUPS % 8 == 0)) { - /* 8 warps config */ - static constexpr int NUM_STAGES = 4; - static constexpr int THREAD_COUNT = 256; - KERNEL(SILU_V2_BLOCK_COUNT, false, THREAD_COUNT, NUM_STAGES); - } else { - /* 1 warp config */ - static constexpr int THREAD_COUNT = 32; - KERNEL(SILU_V2_BLOCK_COUNT, false, THREAD_COUNT, 2); - } - } else { - if (H >= 4096 && (NUM_GROUPS % 8 == 0)) { - /* 8 warps config */ - static constexpr int NUM_STAGES = 4; - static constexpr int THREAD_COUNT = 256; - KERNEL(SILU_V2_BLOCK_COUNT, true, THREAD_COUNT, NUM_STAGES); - } else { - /* 1 warp config */ - static constexpr int THREAD_COUNT = 32; - KERNEL(SILU_V2_BLOCK_COUNT, true, THREAD_COUNT, 2); + #define LAUNCH_ON_H(scale_t, STRIDE_YS_E, STRIDE_YS_T, STRIDE_YS_G, \ + STRIDE_YS_P, CEIL_UE8M0) \ + if (H >= 4096 && (NUM_GROUPS % 8) == 0) { \ + /* 8 warp config */ \ + static constexpr int NUM_STAGES = 4; \ + static constexpr int THREAD_COUNT = 256; \ + KERNEL(SILU_V2_BLOCK_COUNT, scale_t, STRIDE_YS_E, STRIDE_YS_T, \ + STRIDE_YS_G, STRIDE_YS_P, CEIL_UE8M0, THREAD_COUNT, NUM_STAGES); \ + } else { \ + /* 1 warp config */ \ + static constexpr int THREAD_COUNT = 32; \ + KERNEL(SILU_V2_BLOCK_COUNT, scale_t, STRIDE_YS_E, STRIDE_YS_T, \ + STRIDE_YS_G, STRIDE_YS_P, CEIL_UE8M0, THREAD_COUNT, 2); \ } + + Idx_t stride_ys_e = y_s.stride(0); + Idx_t stride_ys_t = y_s.stride(1); + Idx_t stride_ys_g = y_s.stride(2); + Idx_t stride_ys_p = 0; + if (!cast_scale_ue8m0) { + TORCH_CHECK(!is_packed_ue8m0); + LAUNCH_ON_H(float, stride_ys_e, stride_ys_t, stride_ys_g, stride_ys_p, + false); + return; + } + + if (!is_packed_ue8m0) { + // UE8M0 but not packed + LAUNCH_ON_H(float, stride_ys_e, stride_ys_t, stride_ys_g, stride_ys_p, + true); + return; } + TORCH_CHECK(cast_scale_ue8m0 && is_packed_ue8m0); + TORCH_CHECK(y_s.dtype() == torch::kInt32); + + // Int32 packed ue8m0 scales tensor. + // Let E, T, G be the number to experts, number of tokens and number of groups + // respectively. Let, E = 2, T = 4, G = 6, in this case the int32 scales + // tensor are of shape [1, 4, 2] and stride [8, 1, 4]. 
The scales are expected + // to be arranged as follows, + // [[T0G0-T0G1-T0G2-T0G3, T0G4-T0G5-X-X,], + // [T1G0-T1G1-T1G2-T1G3, T1G4-T1G5-X-X,] + // [T2G0-T2G1-T2G2-T2G3, T2G4-T2G5-X-X,] + // [T3G0-T3G1-T3G2-T3G3, T3G4-T3G5-X-X,]] + // where, TxGy is the scale ue8m0 scale value of Token x, Group y. + // + // In memory (in bytes) the scale values are arranged as, + // [T0G0, T0G1, T0G2, T0G3, T1G0, T1G2, T1G3, T1G4, T2G0, T2G1, T2G3, T2G4, + // T3G0, T3G1, T3G2, T3G3, T0G4, T0G5, X, X, T1G4, T1G5, X, X, T2G4, T2G5, + // X, X, T3G4, T3G5, X, X] + // + // An Int32 tensor of size [1, 4, 2] and stride [8, 1, 4] can be represented + // as an uint8 tensor of shape [1, 2, 4, 4] and stride [32, 16, 4, 1]. In + // english, ignoring the Experts dimension, the original int32 tensor is + // simply treated as two packed [4, 4] uint8 tensor (or two [4, 1] int32 + // tensor). The following strides setting reflects this change. Caveat: This + // means that the G dimension is no longer contiguous. i.e. Note that to move + // from G3 to G4, we need to jump along the packing dimension. The kernel + // handles this case. + + stride_ys_e *= sizeof(int32_t); + stride_ys_p = T * sizeof(int32_t); // Packing dimension + stride_ys_t = sizeof(int32_t); + stride_ys_g = 1; + + LAUNCH_ON_H(uint8_t, stride_ys_e, stride_ys_t, stride_ys_g, stride_ys_p, + true); + #endif } diff --git a/tests/conftest.py b/tests/conftest.py index 5e127e4e939e..b17081352edc 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1384,3 +1384,16 @@ def image_urls(request, local_asset_server) -> list[str]: """Indirect fixture: takes a list of names, returns list of full URLs.""" names: list[str] = request.param return [local_asset_server.url_for(name) for name in names] + + +@pytest.fixture +def disable_deepgemm_ue8m0(monkeypatch): + from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used + + with monkeypatch.context() as monkeypatch_ctx: + monkeypatch_ctx.setenv("VLLM_USE_DEEP_GEMM_E8M0", "0") + is_deep_gemm_e8m0_used.cache_clear() + yield + # Clear cache so the next time it is used it is processed with the + # default VLLM_USE_DEEP_GEMM_E8M0 setting. + is_deep_gemm_e8m0_used.cache_clear() diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py index 9d039b81690a..0faf8bc95d2e 100644 --- a/tests/kernels/moe/test_deepep_deepgemm_moe.py +++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -21,7 +21,11 @@ from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel from vllm.platforms import current_platform -from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used, is_deep_gemm_supported +from vllm.utils.deep_gemm import ( + get_mk_alignment_for_contiguous_layout, + is_deep_gemm_e8m0_used, + is_deep_gemm_supported, +) from vllm.utils.import_utils import has_deep_ep, has_deep_gemm from ...utils import multi_gpu_test @@ -413,19 +417,16 @@ def _test_deepep_deepgemm_moe( @multi_gpu_test(num_gpus=2) @requires_deep_ep @requires_deep_gemm -@pytest.mark.skipif( - is_deep_gemm_e8m0_used(), reason="Skipping test for Blackwell DeepGEMM" -) def test_ht_deepep_deepgemm_moe( mnk: tuple[int, int, int], num_experts: int, topk: int, world_dp_size: tuple[int, int], + disable_deepgemm_ue8m0, ): """ Tests for High-Throughput DeepEP + DeepGemm integration. 
""" - import deep_gemm m, n, k = mnk current_platform.seed_everything(7) @@ -433,7 +434,7 @@ def test_ht_deepep_deepgemm_moe( if topk > num_experts: pytest.skip(f"Skipping test: topk={topk} > E={num_experts}") - block_m = deep_gemm.get_m_alignment_for_contiguous_layout() + block_m = get_mk_alignment_for_contiguous_layout()[0] block_size = [block_m, block_m] world_size, dp_size = world_dp_size @@ -487,9 +488,6 @@ def test_ht_deepep_deepgemm_moe( @multi_gpu_test(num_gpus=2) @requires_deep_ep @requires_deep_gemm -@pytest.mark.skipif( - is_deep_gemm_e8m0_used(), reason="Skipping test for Blackwell DeepGEMM" -) def test_ll_deepep_deepgemm_moe( mnk: tuple[int, int, int], num_experts: int, @@ -497,10 +495,12 @@ def test_ll_deepep_deepgemm_moe( use_fp8_dispatch: bool, block_size: list[int], world_dp_size: tuple[int, int], + disable_deepgemm_ue8m0, ): """ Tests for Low-Latency DeepEP + DeepGemm integration. """ + assert not is_deep_gemm_e8m0_used() m, n, k = mnk current_platform.seed_everything(7) diff --git a/tests/kernels/moe/test_deepep_moe.py b/tests/kernels/moe/test_deepep_moe.py index b49319a7e6f5..d78b8250463a 100644 --- a/tests/kernels/moe/test_deepep_moe.py +++ b/tests/kernels/moe/test_deepep_moe.py @@ -294,7 +294,7 @@ def torch_moe_impl( # blockwise quant and de-quant. assert not per_act_token_quant a = test_tensors.rank_tokens - aq, aq_scale = per_token_group_quant_fp8(a, 128) + aq, aq_scale = per_token_group_quant_fp8(a, 128, use_ue8m0=False) a = ( (aq.view(-1, 128).to(torch.float32) * aq_scale.view(-1, 1)) .view(a.shape) diff --git a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py index 420dbbffaac0..d6b78dd2c232 100644 --- a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py +++ b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py @@ -1,6 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import random + import pytest import torch @@ -8,27 +11,30 @@ persistent_masked_m_silu_mul_quant, ) from vllm.platforms import current_platform -from vllm.utils.math_utils import cdiv +from vllm.utils.deep_gemm import DeepGemmQuantScaleFMT, has_deep_gemm +from vllm.utils.math_utils import cdiv, round_up fp8_dtype = torch.float8_e4m3fn CASES = [ (1, 1, 128, fp8_dtype), - (1, 4, 128, fp8_dtype), - (2, 4, 256, fp8_dtype), - (32, 64, 256, fp8_dtype), - (17, 31, 768, fp8_dtype), - (1, 1, 128 * 1, fp8_dtype), - (1, 1, 128 * 3, fp8_dtype), - (1, 1, 128 * 4, fp8_dtype), - (8, 16, 128 * 1, fp8_dtype), - (8, 16, 128 * 2, fp8_dtype), - (8, 16, 128 * 3, fp8_dtype), + (1, 4, 128 * 1, fp8_dtype), + (2, 4, 128 * 2, fp8_dtype), + (1, 4, 128 * 3, fp8_dtype), + (8, 16, 128 * 4, fp8_dtype), + (8, 16, 128 * 5, fp8_dtype), + (8, 16, 128 * 6, fp8_dtype), + (8, 16, 128 * 7, fp8_dtype), + (8, 16, 128 * 8, fp8_dtype), + (8, 16, 128 * 9, fp8_dtype), (8, 64, 7168, fp8_dtype), (8, 128, 128 * 33, fp8_dtype), + (1, 4, 128 * 10, fp8_dtype), (8, 128, 7168, fp8_dtype), (8, 512, 7168, fp8_dtype), (8, 1024, 7168, fp8_dtype), + (17, 31, 768, fp8_dtype), + (32, 64, 256, fp8_dtype), (256, 8, 7168, fp8_dtype), (256, 32, 7168, fp8_dtype), (256, 64, 7168, fp8_dtype), @@ -38,14 +44,159 @@ ] +def as_uint8(x) -> torch.Tensor: + return ( + torch.empty(x.shape, dtype=x.dtype, device=x.device).copy_(x).view(torch.uint8) + ) + + +def silu(x: torch.Tensor) -> torch.Tensor: + one_f32 = torch.tensor([1.0], device=x.device, dtype=torch.float32) + x_f32 = x.to(torch.float32) + act_f32 = x_f32 / (one_f32 + 
torch.exp(-x_f32)) + assert act_f32.dtype == torch.float32 + return act_f32.to(torch.bfloat16) + + +def do_quant(x: torch.Tensor, group_size: int, ceil_ue8m0: bool): + eps_bf16 = torch.tensor([1e-10], device=x.device, dtype=torch.bfloat16) + one_bf16 = torch.tensor([1.0], device=x.device, dtype=torch.bfloat16) + fp8_max_bf16 = torch.tensor( + [torch.finfo(fp8_dtype).max], device=x.device, dtype=torch.bfloat16 + ) + fp8_min_bf16 = torch.tensor( + [torch.finfo(fp8_dtype).min], device=x.device, dtype=torch.bfloat16 + ) + fp8_max_inv = one_bf16 / fp8_max_bf16 + assert fp8_max_inv.dtype == torch.bfloat16 + + assert x.size(-1) % group_size == 0 + num_groups = x.numel() // group_size + x_og_shape = x.shape + + x = x.to(torch.bfloat16) + x = x.view((-1, group_size)) + amax = x.abs().amax(dim=1).clamp(min=eps_bf16) + assert amax.dtype == torch.bfloat16 + s = amax * fp8_max_inv + + if ceil_ue8m0: + s = torch.exp2( + torch.ceil(torch.log2(s).to(torch.bfloat16)).to(torch.bfloat16) + ).to(torch.bfloat16) + + inv_s = one_bf16 / s + inv_s = inv_s.view((num_groups, 1)) + xq = torch.clamp(x * inv_s, min=fp8_min_bf16.item(), max=fp8_max_bf16.item()).to( + fp8_dtype + ) + + xq = xq.view(x_og_shape) + xs = s.view((-1, xq.size(-1) // group_size)) + return xq, xs + + +def silu_mul_quant( + gate: torch.Tensor, up: torch.Tensor, group_size: int, ceil_ue8m0: bool +) -> tuple[torch.Tensor, torch.Tensor]: + assert gate.size(-1) % group_size == 0 + assert up.size(-1) % group_size == 0 + + assert gate.dtype == torch.bfloat16 + assert up.dtype == torch.bfloat16 + + act_bf16 = silu(gate) + assert act_bf16.dtype == torch.bfloat16 + + # act & mul + a_m = act_bf16 * up + assert a_m.dtype == torch.bfloat16 + + q, s = do_quant(a_m, group_size, ceil_ue8m0) + return q, s + + +def pack_scales(x: torch.Tensor, tokens_per_expert: torch.Tensor) -> torch.Tensor: + """ + pack float32 scales into a int32 tensor + """ + assert x.dtype == torch.float32 + E, T, G = x.size() + + # Add i32_padding here so we can view it as a i32 tensor later on. + i32_padding = round_up(G, 4) - G + ref_s_i8 = torch.empty((E, T, G + i32_padding), dtype=torch.uint8, device="cuda") + for e in range(E): + nt = tokens_per_expert[e].item() + ref_s_i8[e, :nt, :G] = x[e, :nt].view(torch.int32) >> 23 + + ref_s_i32 = ref_s_i8.view(torch.int32) + + return ref_s_i32 + + +def ref_with_scale_fmt( + E: int, + T: int, + H: int, + group_size: int, + tokens_per_expert: torch.Tensor, + gate: torch.Tensor, + up: torch.Tensor, + scale_fmt: DeepGemmQuantScaleFMT, +) -> tuple[torch.Tensor, torch.Tensor]: + """ + The precision types of the operations triggered by this function + match closely with the kernel implementation so we compare more + accurately. 
+ """ + scale_dtype = ( + torch.int32 if scale_fmt == DeepGemmQuantScaleFMT.UE8M0 else torch.float32 + ) + ceil_ue8m0 = scale_fmt in [ + DeepGemmQuantScaleFMT.UE8M0, + DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0, + ] + + ref_q = torch.empty((E, T, H), dtype=fp8_dtype, device="cuda") + ref_s_f32 = torch.empty( + (E, T, cdiv(H, group_size)), dtype=torch.float32, device="cuda" + ) + + for e in range(E): + nt = tokens_per_expert[e].item() + if nt == 0: + continue + ref_q[e, :nt], ref_s_f32[e, :nt] = silu_mul_quant( + gate[e, :nt], up[e, :nt], group_size, ceil_ue8m0=ceil_ue8m0 + ) + + if scale_dtype == torch.float32: + return ref_q, ref_s_f32 + + assert scale_dtype == torch.int32 + return ref_q, pack_scales(ref_s_f32, tokens_per_expert) + + +def token_random(E, T, H2, tokens_per_expert): + """ + Initialize each token in a random range so we test a range of + scale values. + """ + y = torch.empty((E, T, H2), dtype=torch.bfloat16, device="cuda") + for e in range(E): + for t in range(tokens_per_expert[e].item()): + exp = random.choice(range(1, 20)) + y[e, t].uniform_(-(2**exp), 2**exp) + return y + + @pytest.mark.parametrize("E,T,H,fp8_type", CASES) @torch.inference_mode() -def test_silu_mul_fp8_quant_deep_gemm(E, T, H, fp8_type): +def test_silu_mul_fp8_quant_deep_gemm(E: int, T: int, H: int, fp8_type: torch.dtype): group_size = 128 current_platform.seed_everything(42) - # Input tensor of shape (E, T, 2*H) - y = torch.randn((E, T, 2 * H), dtype=torch.bfloat16, device="cuda") tokens_per_expert = torch.randint( low=0, high=T, @@ -54,71 +205,83 @@ def test_silu_mul_fp8_quant_deep_gemm(E, T, H, fp8_type): device="cuda", ) - # Run the SiLU V2 kernel - # TODO (varun): use_e8m0 is set to false as the reference impl does - # not handle that case. - y_q, y_s = persistent_masked_m_silu_mul_quant( - y, tokens_per_expert, group_size=group_size, use_ue8m0=False - ) + # Input tensor of shape (E, T, 2*H) + y = token_random(E, T, 2 * H, tokens_per_expert) - torch.cuda.synchronize() - fp8_info = torch.finfo(fp8_dtype) - fp8_max = fp8_info.max - fp8_min = fp8_info.min - eps = 1e-10 + gate = y[..., :H].to(torch.bfloat16) + up = y[..., H:].to(torch.bfloat16) - y1 = y[..., :H].float() - y2 = y[..., H:] - silu_x = y1 * torch.sigmoid(y1) - merged = silu_x * y2 + scale_fmts = [ + DeepGemmQuantScaleFMT.FLOAT32, + DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0, + DeepGemmQuantScaleFMT.UE8M0, + ] - for e in range(E): - nt = tokens_per_expert[e].item() - ref_s = torch.empty( - (T, cdiv(H, group_size)), dtype=torch.float32, device="cuda" + # Run the SiLU V2 kernel + for scale_fmt in scale_fmts: + y_q, y_s = persistent_masked_m_silu_mul_quant( + y, + tokens_per_expert, + group_size=group_size, + quant_scale_fmt=scale_fmt, ) - ref_q = torch.empty((T, H), dtype=fp8_dtype, device="cuda") - for t in range(nt): - data = merged[e, t].float() - ref_q_row = torch.empty_like(data) + ref_y_q, ref_y_s = ref_with_scale_fmt( + E, T, H, group_size, tokens_per_expert, gate, up, scale_fmt=scale_fmt + ) - # process full groups - n_full_groups = H // group_size - if n_full_groups > 0: - data_grp = data[: n_full_groups * group_size].view( - n_full_groups, group_size - ) - amax = data_grp.abs().amax(dim=1).clamp(min=eps) - scale = amax / fp8_max - scaled = data[: n_full_groups * group_size] / scale.repeat_interleave( - group_size - ) - ref_q_row[: n_full_groups * group_size] = scaled.clamp( - fp8_min, fp8_max - ).to(fp8_dtype) - ref_s[t, :n_full_groups] = scale - - # process remainder group - rem = H % group_size - if rem > 0: - data_rem = data[-rem:] - 
amax = data_rem.abs().amax().clamp(min=eps) - scale = amax / fp8_max - scaled = data_rem / scale - ref_q_row[-rem:] = scaled.clamp(fp8_min, fp8_max).to(fp8_dtype) - ref_s[t, -1] = scale - - ref_q[t] = ref_q_row - - y_se = y_s[e].float() - y_qe = y_q[e].float() - - torch.testing.assert_close( - y_qe[:nt].to(torch.float32), - ref_q[:nt].to(torch.float32), - atol=2, - rtol=2e-1, + # deepgemm scales transform + dg_scales = None + if ( + has_deep_gemm() + and current_platform.has_device_capability(100) + and scale_fmt == DeepGemmQuantScaleFMT.UE8M0 + ): + from deep_gemm import transform_sf_into_required_layout + + _q, _s = ref_with_scale_fmt( + E, + T, + H, + group_size, + tokens_per_expert, + gate, + up, + scale_fmt=DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0, + ) + dg_scales = transform_sf_into_required_layout( + sf=_s, + mn=_q.size(1), + k=_q.size(2), + recipe=(1, 128, 128), + num_groups=_q.size(0), + is_sfa=True, + ) + + expected_scale_dtype = ( + torch.int32 if scale_fmt == DeepGemmQuantScaleFMT.UE8M0 else torch.float32 ) + assert y_s.dtype == expected_scale_dtype + assert ref_y_s.dtype == expected_scale_dtype - torch.testing.assert_close(y_se[:nt], ref_s[:nt], atol=1e-4, rtol=1e-2) + for e in range(E): + nt = tokens_per_expert[e].item() + + torch.testing.assert_close( + y_q[e, :nt].to(torch.float32), + ref_y_q[e, :nt].to(torch.float32), + ) + + if scale_fmt == DeepGemmQuantScaleFMT.UE8M0: + G = H // group_size + y_s_sliced = as_uint8(y_s[e]) + ref_s_sliced = as_uint8(ref_y_s[e]) + torch.testing.assert_close(y_s_sliced[:nt, :G], ref_s_sliced[:nt, :G]) + if dg_scales is not None: + dg_sliced = as_uint8(dg_scales[e]) + torch.testing.assert_close(y_s_sliced[:nt, :G], dg_sliced[:nt, :G]) + else: + torch.testing.assert_close( + y_s[e, :nt], + ref_y_s[e, :nt], + ) diff --git a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py index 869082f8231d..79c92eb48612 100644 --- a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project + import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk @@ -13,14 +14,33 @@ from vllm.platforms import current_platform from vllm.triton_utils import tl, triton from vllm.utils.deep_gemm import ( + DeepGemmQuantScaleFMT, fp8_m_grouped_gemm_nt_masked, get_mk_alignment_for_contiguous_layout, is_deep_gemm_e8m0_used, ) +from vllm.utils.math_utils import cdiv logger = init_logger(__name__) +def scales_shape_stride_dtype( + E: int, T: int, G: int, quant_scale_fmt: DeepGemmQuantScaleFMT +) -> tuple[tuple[int, ...], tuple[int, ...], torch.dtype]: + shape = (E, T, G) + strides = (T * G, 1, T) + if quant_scale_fmt in [ + DeepGemmQuantScaleFMT.FLOAT32, + DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0, + ]: + return shape, strides, torch.float32 + + assert quant_scale_fmt == DeepGemmQuantScaleFMT.UE8M0 + shape = (E, T, cdiv(G, 4)) + strides = (T * cdiv(G, 4), 1, T) + return shape, strides, torch.int32 + + @triton.jit def _silu_mul_fp8_quant_deep_gemm( # Pointers ------------------------------------------------------------ @@ -49,7 +69,7 @@ def _silu_mul_fp8_quant_deep_gemm( eps: tl.constexpr, fp8_min: tl.constexpr, fp8_max: tl.constexpr, - use_ue8m0: tl.constexpr, + ceil_ue8m0: tl.constexpr, # Meta --------------------------------------------------------------- BLOCK: tl.constexpr, NUM_STAGES: 
tl.constexpr, @@ -86,7 +106,7 @@ def _silu_mul_fp8_quant_deep_gemm( y = gate * up y_s = tl.maximum(tl.max(tl.abs(y)), eps) / fp8_max - if use_ue8m0: + if ceil_ue8m0: y_s = tl.exp2(tl.ceil(tl.log2(y_s))) y_q = tl.clamp(y / y_s, fp8_min, fp8_max).to(y_q_ptr.dtype.element_ty) @@ -100,7 +120,7 @@ def persistent_masked_m_silu_mul_quant( tokens_per_expert: torch.Tensor, # (E,) number of valid tokens per expert num_parallel_tokens=16, group_size: int = 128, - use_ue8m0: bool | None = None, + quant_scale_fmt: DeepGemmQuantScaleFMT = DeepGemmQuantScaleFMT.FLOAT32, ) -> tuple[torch.Tensor, torch.Tensor]: """Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales y has shape (E, T, 2*H). The first half of the last dimension is @@ -137,7 +157,13 @@ def persistent_masked_m_silu_mul_quant( Returns `(y_q, y_s)` where * `y_q`: FP8 tensor, shape (E, T, H), same layout as y[..., :H] - * `y_s`: FP32 tensor, shape (E, T, H // group_size), strides (T*G, 1, T) + * `y_s` depends on quant_scale_fmt, + - quant_scale_fmt == FLOAT32, + `y_s`: FP32 tensor, shape (E, T, H // group_size), strides (T*G, 1, T) + - quant_scale_fmt == E8M0, + `y_s`: Int32 tensor, shape (E, T, H // group_size // 4), strides (T*G, 1, T) + - quant_scale_fmt == E8M0_FLOAT32_SPARSE + `y_s`: FP32 tensor, shape (E, T, H // group_size), strides (T*G, 1, T) Let NUM_WARPS be the number of warps in a single thread block and `GROUP_SIZE = 128` be the size of the quantization group. """ @@ -155,17 +181,18 @@ def persistent_masked_m_silu_mul_quant( fp8_dtype = torch.float8_e4m3fn y_q = torch.empty((E, T, H), dtype=fp8_dtype, device=y.device) - stride_ys_e = T * G - stride_ys_t = 1 - stride_ys_g = T + ys_shape, ys_strides, ys_dtype = scales_shape_stride_dtype(E, T, G, quant_scale_fmt) y_s = torch.empty_strided( - (E, T, G), - (stride_ys_e, stride_ys_t, stride_ys_g), - dtype=torch.float32, + ys_shape, + ys_strides, + dtype=ys_dtype, device=y.device, ) - use_ue8m0 = use_ue8m0 if use_ue8m0 is not None else is_deep_gemm_e8m0_used() + ceil_ue8m0 = quant_scale_fmt in [ + DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0, + DeepGemmQuantScaleFMT.UE8M0, + ] cuda_arch = current_platform.get_device_capability( device_id=y.device.index @@ -173,7 +200,7 @@ def persistent_masked_m_silu_mul_quant( if cuda_arch >= 80: torch.ops._C.persistent_masked_m_silu_mul_quant( - y, tokens_per_expert, y_q, y_s, use_ue8m0 + y, tokens_per_expert, y_q, y_s, ceil_ue8m0 ) else: stride_cnt_e = tokens_per_expert.stride()[0] @@ -189,6 +216,10 @@ def persistent_masked_m_silu_mul_quant( fp8_max = f_info.max fp8_min = f_info.min eps: float = 1e-10 + assert y_s.dtype == torch.float32, ( + "_silu_mul_fp8_quant_deep_gemm does" + "not support {y_s.dtype} scales. Only torch.float32 supported." + ) _silu_mul_fp8_quant_deep_gemm[grid]( y, y_q, @@ -202,14 +233,14 @@ def persistent_masked_m_silu_mul_quant( stride_yq_e, stride_yq_t, stride_yq_h, - stride_ys_e, - stride_ys_t, - stride_ys_g, + ys_strides[0], + ys_strides[1], + ys_strides[2], stride_cnt_e, eps, fp8_min, fp8_max, - is_deep_gemm_e8m0_used(), + ceil_ue8m0, BLOCK=group_size, NUM_STAGES=4, num_warps=1, @@ -255,7 +286,7 @@ def supports_packed_ue8m0_act_scales(self) -> bool: """ DeepGemm supports packed ue8m0 activation scales format in devices == sm100 """ - return current_platform.is_device_capability(100) + return is_deep_gemm_e8m0_used() and current_platform.is_device_capability(100) def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: # Let PrepareAndFinalize::finalize() decide the impl. 
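To make the bookkeeping in scales_shape_stride_dtype concrete, here is a small self-contained sketch using the (8, 64, 7168) case from the tests above; cdiv below is a local stand-in for vllm.utils.math_utils.cdiv so the snippet runs on its own:

    def cdiv(a: int, b: int) -> int:
        # Stand-in for vllm.utils.math_utils.cdiv (ceiling division).
        return -(-a // b)

    E, T, H, group_size = 8, 64, 7168, 128
    G = H // group_size  # 56 groups per token

    # FLOAT32 / FLOAT32_CEIL_UE8M0: one fp32 scale per group, with the token
    # dimension contiguous inside each group.
    assert ((E, T, G), (T * G, 1, T)) == ((8, 64, 56), (3584, 1, 64))

    # UE8M0: four one-byte scales packed into each int32 element.
    assert ((E, T, cdiv(G, 4)), (T * cdiv(G, 4), 1, T)) == ((8, 64, 14), (896, 1, 64))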
@@ -329,10 +360,17 @@ def apply( expected_m, ) + quant_scale_fmt = DeepGemmQuantScaleFMT.from_oracle() a2q, a2q_scale = persistent_masked_m_silu_mul_quant( - workspace1, expert_num_tokens + workspace1, + expert_num_tokens, + quant_scale_fmt=quant_scale_fmt, ) fp8_m_grouped_gemm_nt_masked( - (a2q, a2q_scale), (w2, self.w2_scale), output, expert_num_tokens, expected_m + (a2q, a2q_scale), + (w2, self.w2_scale), + output, + expert_num_tokens, + expected_m, ) diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index 4c15baf7a8f9..b5ab37534dd7 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -9,6 +9,7 @@ import importlib import os from collections.abc import Callable +from enum import Enum from typing import Any, NoReturn import torch @@ -20,6 +21,28 @@ from vllm.utils.math_utils import cdiv +class DeepGemmQuantScaleFMT(Enum): + # Float32 scales in Float32 tensor + FLOAT32 = 0 + # Compute float32 scales and ceil the scales to UE8M0. + # Keep the scales in Float32 tensor. + FLOAT32_CEIL_UE8M0 = 1 + # Compute float32 scales and ceil the scales to UE8M0. + # Pack the scales into a int32 tensor where each int32 + # element contains 4 scale values. + UE8M0 = 2 + + @staticmethod + def from_oracle() -> "DeepGemmQuantScaleFMT": + if not is_deep_gemm_e8m0_used(): + return DeepGemmQuantScaleFMT.FLOAT32 + return ( + DeepGemmQuantScaleFMT.UE8M0 + if current_platform.is_device_capability(100) + else DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0 + ) + + @functools.cache def is_deep_gemm_supported() -> bool: """Return `True` if DeepGEMM is supported on the current platform. From 119c4927b33f78cb8bb2283a57ee0e3a14021777 Mon Sep 17 00:00:00 2001 From: Yannick Schnider Date: Thu, 13 Nov 2025 19:18:47 +0100 Subject: [PATCH 021/578] [Bugfix] Fix validate model input for decoder models (#27099) Signed-off-by: Yannick Schnider Signed-off-by: Yannick Schnider Signed-off-by: Michael Goin Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Michael Goin Co-authored-by: Roger Wang --- tests/v1/e2e/test_context_length.py | 63 +++++++++++++++++++++++++++++ vllm/v1/engine/processor.py | 15 +++++++ 2 files changed, 78 insertions(+) create mode 100644 tests/v1/e2e/test_context_length.py diff --git a/tests/v1/e2e/test_context_length.py b/tests/v1/e2e/test_context_length.py new file mode 100644 index 000000000000..0ac40bec35fe --- /dev/null +++ b/tests/v1/e2e/test_context_length.py @@ -0,0 +1,63 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Tests for vLLM `vllm/v1/engine/processor.Processor._validate_model_input()` +handling of maximum context length for decoder models. + +This test ensures: +- A prompt that is one token shorter than the model's maximum context length + can be processed successfully when requesting one additional token. +- A prompt that reaches the model's maximum context length throws a + `ValueError` when requesting at least one additional token. 
+""" + +import pytest + +from tests.conftest import VllmRunner +from tests.utils import create_new_process_for_each_test + + +@create_new_process_for_each_test() +@pytest.mark.parametrize("model, max_model_len", [("JackFram/llama-160m", 2048)]) +@pytest.mark.parametrize( + "prompt_len, max_tokens", + [ + (2047, 1), # prompt_len = max_model_len - 1 -> allowed + (2048, 1), # prompt_len = max_model_len -> not allowed + ], +) +def test_decoder_max_context_length_validation( + model: str, + max_model_len: int, + vllm_runner: type[VllmRunner], + prompt_len: int, + max_tokens: int, +) -> None: + """Check vLLM decoder model input validation for edge cases where + the prompt length is (almost) equal to the max model length.""" + + prompt_ids = [[43] * prompt_len] + + with vllm_runner( + model_name=model, + tokenizer_name=model, + max_model_len=max_model_len, + max_num_seqs=1, + tensor_parallel_size=1, + ) as vllm_model: + if prompt_len + max_tokens <= max_model_len: + # Should succeed as constraints are met + vllm_model.generate_greedy(prompt_ids, max_tokens) + else: + # Should raise the ValueError defined in + # vllm/v1/engine/processor.Processor_validate_model_input() + expected_msg = ( + f"The decoder prompt (length {prompt_len}) plus the number of " + f"requested output tokens (at least 1) is longer than " + f"the maximum model length of {max_model_len}. " + "Make sure that `max_model_len` is no smaller than the number of " + "text tokens (prompt + requested output tokens)." + ) + with pytest.raises(ValueError) as excinfo: + vllm_model.generate_greedy(prompt_ids, max_tokens) + assert expected_msg in str(excinfo.value) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index f2d992403e1a..69509d5d4712 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -575,6 +575,21 @@ def _validate_model_input( # check that chunked prefill does not truncate them # max_batch_len = self.scheduler_config.max_num_batched_tokens + if ( + prompt_len == max_prompt_len + and prompt_type == "decoder" + and not model_config.is_multimodal_model + ): + suggestion = ( + "Make sure that `max_model_len` is no smaller than the " + "number of text tokens (prompt + requested output tokens)." + ) + raise ValueError( + f"The {prompt_type} prompt (length {prompt_len}) plus the number of " + f"requested output tokens (at least 1) is longer than the maximum " + f"model length of {max_prompt_len}. 
{suggestion}" + ) + def stat_mm_cache(self) -> MultiModalCacheStats | None: return self.input_preprocessor.stat_mm_cache() From f9f3b596f374c4a01acef275ee1f35398bb05164 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Thu, 13 Nov 2025 12:20:01 -0600 Subject: [PATCH 022/578] [Attention][Bugfix] Fix FA sink support (#28660) Signed-off-by: Matthew Bonanni --- vllm/v1/attention/backends/flash_attn.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index bfb4a45c2b56..81623549ae85 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -130,6 +130,12 @@ def supports_kv_cache_dtype(cls, kv_cache_dtype: CacheDType | None) -> bool: return flash_attn_supports_fp8() return kv_cache_dtype in ["auto"] + @classmethod + def supports_sink(cls) -> bool: + if not is_flash_attn_varlen_func_available(): + return False + return flash_attn_supports_sinks() + @classmethod def supports_compute_capability(cls, capability: DeviceCapability) -> bool: return capability >= DeviceCapability(8, 0) From 5d6ce2b9601f3251487e44eb9e00c098101c4af6 Mon Sep 17 00:00:00 2001 From: elvischenv <219235043+elvischenv@users.noreply.github.com> Date: Fri, 14 Nov 2025 02:21:25 +0800 Subject: [PATCH 023/578] [Perf] Support stream interval for reducing host overhead (#27869) Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com> Co-authored-by: Nick Hill --- tests/v1/engine/test_output_processor.py | 18 ++++++++++-- vllm/config/scheduler.py | 6 ++++ vllm/engine/arg_utils.py | 6 ++++ vllm/v1/engine/async_llm.py | 3 +- vllm/v1/engine/llm_engine.py | 3 +- vllm/v1/engine/output_processor.py | 36 +++++++++++++++++++++++- 6 files changed, 67 insertions(+), 5 deletions(-) diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index d77a119ec60f..8e1198b315bd 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -49,10 +49,15 @@ def _ref_convert_id_to_token( @pytest.mark.parametrize( "request_output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY] ) +@pytest.mark.parametrize("stream_interval", [1, 5, 10]) def test_incremental_detokenization( - request_output_kind: RequestOutputKind, dummy_test_vectors + request_output_kind: RequestOutputKind, + stream_interval: int, + dummy_test_vectors, ): - output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False) + output_processor = OutputProcessor( + dummy_test_vectors.tokenizer, log_stats=False, stream_interval=stream_interval + ) engine_core = MockEngineCore(tokens_list=dummy_test_vectors.generation_tokens) # Make N requests. @@ -104,9 +109,18 @@ def test_incremental_detokenization( if request_id not in gen_strings: gen_strings[request_id] = new_text gen_tokens[request_id] = new_tokens + if request_output_kind == RequestOutputKind.DELTA: + assert len(new_tokens) == 1, f"{len(new_tokens)=}" else: gen_strings[request_id] += new_text gen_tokens[request_id].extend(new_tokens) + if ( + request_output_kind == RequestOutputKind.DELTA + and not request_output.finished + ): + assert len(new_tokens) >= stream_interval, ( + f"{len(new_tokens)=}, {stream_interval=}" + ) # Confirmed tracked values matches what we expected. 
for idx, (ref_gen_str, ref_gen_toks) in enumerate( diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py index 47aa343527b3..71a06e167fd9 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -142,6 +142,12 @@ class SchedulerConfig: speculative decoding and pipeline parallelism. """ + stream_interval: int = Field(default=1, ge=1) + """The interval (or buffer size) for streaming in terms of token length. + A smaller value (1) makes streaming smoother by sending each token immediately, + while a larger value (e.g., 10) reduces host overhead and may increase throughput + by batching multiple tokens before sending.""" + def get_scheduler_cls(self) -> type["SchedulerInterface"]: if self.scheduler_cls is None: if self.async_scheduling: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ca7f5e5e3e05..b025004ea022 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -558,6 +558,8 @@ class EngineArgs: async_scheduling: bool | None = SchedulerConfig.async_scheduling + stream_interval: int = SchedulerConfig.stream_interval + kv_sharing_fast_prefill: bool = CacheConfig.kv_sharing_fast_prefill kv_offloading_size: float | None = CacheConfig.kv_offloading_size @@ -1067,6 +1069,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: scheduler_group.add_argument( "--async-scheduling", **scheduler_kwargs["async_scheduling"] ) + scheduler_group.add_argument( + "--stream-interval", **scheduler_kwargs["stream_interval"] + ) # Compilation arguments compilation_kwargs = get_kwargs(CompilationConfig) @@ -1562,6 +1567,7 @@ def create_engine_config( long_prefill_token_threshold=self.long_prefill_token_threshold, disable_hybrid_kv_cache_manager=self.disable_hybrid_kv_cache_manager, async_scheduling=self.async_scheduling, + stream_interval=self.stream_interval, ) if not model_config.is_multimodal_model and self.default_mm_loras: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index aee21fb3fffe..48ea6ef8515c 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -120,8 +120,9 @@ def __init__( ) # OutputProcessor (converts EngineCoreOutputs --> RequestOutput). + stream_interval = self.vllm_config.scheduler_config.stream_interval self.output_processor = OutputProcessor( - self.tokenizer, log_stats=self.log_stats + self.tokenizer, log_stats=self.log_stats, stream_interval=stream_interval ) endpoint = self.observability_config.otlp_traces_endpoint if endpoint is not None: diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 6224af5700b7..1db83446ba0b 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -96,8 +96,9 @@ def __init__( ) # OutputProcessor (convert EngineCoreOutputs --> RequestOutput). 
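Since the new stream_interval knob flows from EngineArgs through SchedulerConfig into the OutputProcessor, a hedged usage sketch (assuming this patch is applied; the model name is only an example):

    from vllm.engine.arg_utils import AsyncEngineArgs

    # stream_interval=1 keeps per-token streaming; larger values buffer tokens
    # before each RequestOutput is emitted, trading smoothness for lower host
    # overhead. The first token and finished requests are still sent promptly.
    args = AsyncEngineArgs(model="facebook/opt-125m", stream_interval=8)
    assert args.stream_interval == 8
    # create_engine_config() copies this into SchedulerConfig.stream_interval,
    # which AsyncLLM then forwards to its OutputProcessor.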
+ stream_interval = self.vllm_config.scheduler_config.stream_interval self.output_processor = OutputProcessor( - self.tokenizer, log_stats=self.log_stats + self.tokenizer, log_stats=self.log_stats, stream_interval=stream_interval ) endpoint = self.observability_config.otlp_traces_endpoint if endpoint is not None: diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index d8d03f19d466..bdbbfe2595f8 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -104,6 +104,7 @@ def __init__( arrival_time: float, queue: RequestOutputCollector | None, log_stats: bool, + stream_interval: int, top_p: float | None = None, n: int | None = None, temperature: float | None = None, @@ -131,6 +132,10 @@ def __init__( self.stats = RequestStateStats(arrival_time=arrival_time) if log_stats else None + # Stream Interval + self.stream_interval = stream_interval + self.sent_tokens_offset = 0 # Offset of sent tokens + @classmethod def from_new_request( cls, @@ -141,6 +146,7 @@ def from_new_request( request_index: int, queue: RequestOutputCollector | None, log_stats: bool, + stream_interval: int, ) -> "RequestState": if sampling_params := request.sampling_params: if not sampling_params.detokenize: @@ -188,6 +194,7 @@ def from_new_request( arrival_time=request.arrival_time, queue=queue, log_stats=log_stats, + stream_interval=stream_interval, ) def make_request_output( @@ -205,6 +212,29 @@ def make_request_output( # Only the final output is required in FINAL_ONLY mode. return None + if self.stream_interval > 1: + assert self.detokenizer is not None + + # Send output request only when + # 1. It has finished, or + # 2. It is the first token, or + # 3. It has reached the stream interval number of tokens + if not ( + finished + or self.sent_tokens_offset == 0 + or len(self.detokenizer.output_token_ids) - self.sent_tokens_offset + >= self.stream_interval + ): + return None + + if self.output_kind == RequestOutputKind.DELTA: + # Send tokens from the offset in DELTA mode, otherwise all + # tokens are sent. 
+ new_token_ids = self.detokenizer.output_token_ids[ + self.sent_tokens_offset : + ] + self.sent_tokens_offset = len(self.detokenizer.output_token_ids) + request_id = self.request_id if pooling_output is not None: return self._new_request_output( @@ -310,9 +340,12 @@ def _new_pooling_output( class OutputProcessor: """Process EngineCoreOutputs into RequestOutputs.""" - def __init__(self, tokenizer: AnyTokenizer, log_stats: bool): + def __init__( + self, tokenizer: AnyTokenizer, log_stats: bool, stream_interval: int = 1 + ): self.log_stats = log_stats self.tokenizer = tokenizer + self.stream_interval = stream_interval self.request_states: dict[str, RequestState] = {} self.parent_requests: dict[str, ParentRequest] = {} self.lora_states = LoRARequestStates(log_stats) @@ -385,6 +418,7 @@ def add_request( request_index=request_index, queue=queue, log_stats=self.log_stats, + stream_interval=self.stream_interval, ) self.request_states[request_id] = req_state if parent_req: From 968060c15adc0b68a76d37db00acf1273a23b829 Mon Sep 17 00:00:00 2001 From: Qiu Date: Fri, 14 Nov 2025 03:29:22 +0800 Subject: [PATCH 024/578] [bugfix] correct local_chunk_len for DCP in reorg_kvcache with long context (#28526) Signed-off-by: QiuChunshuo Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/v1/attention/backends/mla/common.py | 29 ++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 467c01cd9d06..2ccdd1f143ce 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -337,6 +337,7 @@ class ChunkedContextMetadata: local_context_lens_allranks: list[list[int]] | None = None padded_local_cu_seq_lens: torch.Tensor | None = None cu_seq_lens_lst: list[list[int]] | None = None + chunk_size: int | None = None block_table: torch.Tensor query_start_loc: torch.Tensor @@ -902,6 +903,7 @@ def build( device, non_blocking=True ), cu_seq_lens_lst=cu_seq_lens_cpu.tolist(), + chunk_size=padded_local_max_context_chunk_across_ranks, ) else: chunked_context_metadata = chunked_context_metadata_cls( @@ -986,6 +988,8 @@ def reorg_kvcache( local_context_lens_allranks: list[list[int]], sum_seq_len: int, max_seq_len: int, + chunk_size: int, + chunk_idx: int, toks: int, ) -> tuple[torch.Tensor, torch.Tensor]: """ @@ -1001,6 +1005,9 @@ def reorg_kvcache( local_context_lens_allranks: local context lengths on each CP rank. sum_seq_len: the sum of cp_chunk_seq_lens_lst. max_seq_len: the max value of cp_chunk_seq_lens_lst. + chunk_size: the local padded max context chunk from + chunked_context_metadata building. + chunk_idx: chunk idx of chunked_prefill. toks: the number of tokens for local gather cache. """ kv_c_segments = [] @@ -1012,20 +1019,31 @@ def reorg_kvcache( ): cur_seq_len = 0 for rank, local_context_len in enumerate(local_context_lens): - if local_context_len != 0: + # Note(qcs): We split the context into multiple chunks, + # depending on the size of the workspace. + # local_context in dcp0: |-----------------| + # local_context in dcp1: |--------------| + # n*padded_local_chunk: |-----|-----|-----| + # local_chunk_len in dcp1: |-----|-----|--| + # so we need update the last chunk length in dcp1. 
+ local_chunk_len = min( + max(0, local_context_len - chunk_idx * chunk_size), + padded_local_chunk_seq_len, + ) + if local_chunk_len != 0: kv_c_segment = allgatered_kv_c_normed[ rank * toks + src_token_idx : rank * toks + src_token_idx - + local_context_len + + local_chunk_len ] k_pe_segment = allgatered_k_pe[ rank * toks + src_token_idx : rank * toks + src_token_idx - + local_context_len + + local_chunk_len ] kv_c_segments.append(kv_c_segment) k_pe_segments.append(k_pe_segment) - cur_seq_len += local_context_len + cur_seq_len += local_chunk_len max_seq_len_check = max(max_seq_len_check, cur_seq_len) src_token_idx += padded_local_chunk_seq_len reorganized_kv_c_normed = torch.cat(kv_c_segments, dim=0) @@ -1676,6 +1694,7 @@ def _context_parallel_compute_prefill_context( assert prefill_metadata.chunked_context.local_context_lens_allranks is not None assert prefill_metadata.chunked_context.padded_local_cu_seq_lens is not None assert prefill_metadata.chunked_context.cu_seq_lens_lst is not None + assert prefill_metadata.chunked_context.chunk_size is not None output = None iters = len(prefill_metadata.chunked_context.seq_tot) @@ -1725,6 +1744,8 @@ def _context_parallel_compute_prefill_context( local_context_lens_allranks=prefill_metadata.chunked_context.local_context_lens_allranks, sum_seq_len=prefill_metadata.chunked_context.cu_seq_lens_lst[i][-1], max_seq_len=prefill_metadata.chunked_context.max_seq_lens[i], + chunk_size=prefill_metadata.chunked_context.chunk_size, + chunk_idx=i, toks=toks, ) From 262d263f6c56fa95e15422d3a475da8efdf67cc1 Mon Sep 17 00:00:00 2001 From: Yanan Cao Date: Thu, 13 Nov 2025 12:09:05 -0800 Subject: [PATCH 025/578] [Bugfix] Eliminate tuple inputs to submodules in graph partitioning (#28533) Signed-off-by: Yanan Cao --- .buildkite/test-pipeline.yaml | 1 + tests/compile/test_graph_partition.py | 124 ++++++++++++++++++++++++++ vllm/compilation/backends.py | 17 +++- 3 files changed, 140 insertions(+), 2 deletions(-) create mode 100644 tests/compile/test_graph_partition.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index be1b79ddc432..52539728215b 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -445,6 +445,7 @@ steps: - vllm/ - tests/compile commands: + - pytest -v -s compile/test_graph_partition.py - pytest -v -s compile/test_config.py - pytest -v -s compile/test_pass_manager.py - pytest -v -s compile/test_fusion.py diff --git a/tests/compile/test_graph_partition.py b/tests/compile/test_graph_partition.py new file mode 100644 index 000000000000..1cd783843a62 --- /dev/null +++ b/tests/compile/test_graph_partition.py @@ -0,0 +1,124 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import operator + +import pytest +import torch +from torch.fx.experimental.proxy_tensor import make_fx + +from vllm.compilation.backends import split_graph + + +def test_getitem_moved_to_producer_subgraph(): + """ + Test that getitem operations are moved to the same subgraph as their input, + preventing tuple inputs to submodules. 
+ """ + + def model_fn(x: torch.Tensor) -> torch.Tensor: + # torch.split returns a tuple, creating real getitem operations + # Should become first submodule that produces tuple + chunks = torch.split(x, x.shape[0] // 2, dim=0) + + # Following ops should become second submodule that consumes tuple + result_0 = torch.relu(chunks[0]) + result_1 = torch.relu(chunks[1]) + return torch.cat([result_0, result_1], dim=0) + + x = torch.randn(4, 3) + gm = make_fx(model_fn)(x) + + has_getitem = any( + node.op == "call_function" and node.target == operator.getitem + for node in gm.graph.nodes + ) + assert has_getitem, "Test setup failed: graph should contain getitem operations" + + # Split on tuple producer aten::split + split_ops = ["aten::split.Tensor"] + split_gm, split_items = split_graph(gm, split_ops) + assert len(split_items) == 2, "Graph should be split into 2 submodules" + + for split_item in split_items: + submodule = split_item.graph + + getitem_on_placeholder = [] + for node in submodule.graph.nodes: + if ( + node.op == "call_function" + and node.target == operator.getitem + and node.args[0].op == "placeholder" + ): + getitem_on_placeholder.append(node) + + assert len(getitem_on_placeholder) == 0, ( + f"Submodule {split_item.submod_name} has getitem operations on " + f"placeholder nodes: {[n.name for n in getitem_on_placeholder]}. " + "This means tuple inputs were not properly eliminated." + ) + + new_x = torch.randn(4, 3) + output_original = gm(new_x) + output_split = split_gm(new_x) + + assert torch.allclose(output_original, output_split), "Output mismatch" + + +def test_no_tuple_inputs_with_multiple_consumers(): + """ + Test that when a tuple is consumed by multiple split operations, + getitem operations are properly moved to avoid tuple inputs. + """ + + def model_fn(x: torch.Tensor) -> torch.Tensor: + # torch.split returns a tuple, creating real getitem operations + # Should become first submodule that produces tuple + chunks = torch.split(x, x.shape[0] // 2, dim=0) + + # These should become second submodule consuming tuple + result_1 = torch.relu(chunks[0]) + result_2 = torch.relu(chunks[1]) + + # Artificial graph splitting point to create another + # independent submodule that consumes tuple later + # This would become the third submodule + result_1 = torch.sigmoid(result_1) + + # Fourth submodule that consumes tuple + result = torch.cat([chunks[0], chunks[1], result_1, result_2]) + return result + + x = torch.randn(4, 3) + gm = make_fx(model_fn)(x) + + has_getitem = any( + node.op == "call_function" and node.target == operator.getitem + for node in gm.graph.nodes + ) + assert has_getitem, "Test setup failed: graph should contain getitem operations" + + split_ops = ["aten::split.Tensor", "aten::sigmoid"] + split_gm, split_items = split_graph(gm, split_ops) + assert len(split_items) == 4, "Graph should be split into 4 submodules" + + for split_item in split_items: + submodule = split_item.graph + + for node in submodule.graph.nodes: + if ( + node.op == "call_function" + and node.target == operator.getitem + and node.args[0].op == "placeholder" + ): + pytest.fail( + f"Submodule {split_item.submod_name} has getitem on " + f"placeholder {node.args[0].name}, indicating it receives " + "a tuple input" + ) + + new_x = torch.randn(4, 3) + output_original = gm(new_x) + output_split = split_gm(new_x) + + assert torch.allclose(output_original, output_split), "Output mismatch after split" diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index be69075f94f0..60ef6eef2166 
100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -4,6 +4,7 @@ import ast import dataclasses import hashlib +import operator import os import pprint import time @@ -307,12 +308,24 @@ def split_graph( ) -> tuple[fx.GraphModule, list[SplitItem]]: # split graph by ops subgraph_id = 0 - node_to_subgraph_id = {} - split_op_graphs = [] + node_to_subgraph_id: dict[fx.Node, int] = {} + split_op_graphs: list[int] = [] for node in graph.graph.nodes: if node.op in ("output", "placeholder"): continue + # Check if this is a getitem operation on a node from an earlier subgraph. + # If so, assign it to the same subgraph as its input to avoid passing entire + # tuple as input to submodules, which is against standalone_compile and + # AoTAutograd input requirement. + if node.op == "call_function" and node.target == operator.getitem: + # Assign this getitem to the same subgraph as its input + input_node = node.args[0] + if input_node.op != "placeholder": + assert input_node in node_to_subgraph_id + node_to_subgraph_id[node] = node_to_subgraph_id[input_node] + continue + if should_split(node, splitting_ops): subgraph_id += 1 node_to_subgraph_id[node] = subgraph_id From faed7bf07ec831529c5ed54e15b21e30b30dc16e Mon Sep 17 00:00:00 2001 From: Kebe Date: Fri, 14 Nov 2025 05:48:08 +0900 Subject: [PATCH 026/578] [Bugfix] [CPU] bump torch to 2.9.0 for Darwin to fix segmentation fault (#27791) Signed-off-by: Kebe Signed-off-by: Michael Goin Co-authored-by: Michael Goin --- requirements/cpu.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/cpu.txt b/requirements/cpu.txt index 605ce73bff9c..d11787df4d92 100644 --- a/requirements/cpu.txt +++ b/requirements/cpu.txt @@ -8,7 +8,7 @@ packaging>=24.2 setuptools>=77.0.3,<81.0.0 --extra-index-url https://download.pytorch.org/whl/cpu torch==2.8.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x" -torch==2.8.0; platform_system == "Darwin" +torch==2.9.0; platform_system == "Darwin" torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64" # required for the image processor of minicpm-o-2_6, this must be updated alongside torch From 1b622deba73347f044c13fa80a09a5647d21a45c Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Thu, 13 Nov 2025 13:01:43 -0800 Subject: [PATCH 027/578] [Misc] Update CODEOWNERS for simon-mo and comaniac (#28675) Signed-off-by: Simon Mo --- .github/CODEOWNERS | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index f26c782bccf2..bfb0e91fd06e 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -3,8 +3,8 @@ # This lists cover the "core" components of vLLM that require careful review /vllm/attention @LucasWilkinson -/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill -/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn +/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill +/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn /vllm/model_executor/layers/fused_moe @mgoin @pavanimajety /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety /vllm/model_executor/layers/mamba @tdoublep @@ -20,15 +20,15 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson # Any change to the VllmConfig changes can have a large user-facing impact, # so spam a lot of people -/vllm/config @simon-mo 
@WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg -/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345 +/vllm/config @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg +/vllm/config/cache.py @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345 # vLLM V1 /vllm/v1/attention @LucasWilkinson /vllm/v1/attention/backends/mla @pavanimajety /vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety /vllm/v1/attention/backends/triton_attn.py @tdoublep -/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC +/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC /vllm/v1/sample @22quinn @houseroad @njhill /vllm/v1/spec_decode @benchislett @luccafong /vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett @@ -36,11 +36,11 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /vllm/v1/offloading @ApostaC # Test ownership -/.buildkite/lm-eval-harness @mgoin @simon-mo +/.buildkite/lm-eval-harness @mgoin /tests/distributed/test_multi_node_assignment.py @youkaichao /tests/distributed/test_pipeline_parallel.py @youkaichao /tests/distributed/test_same_node.py @youkaichao -/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche +/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @aarnphm @NickLucche /tests/evals @mgoin /tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256 /tests/models @DarkLight1337 @ywang96 @@ -49,7 +49,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /tests/test_inputs.py @DarkLight1337 @ywang96 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm /tests/v1/structured_output @mgoin @russellb @aarnphm -/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC +/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC /tests/weight_loading @mgoin @youkaichao @yewentao256 /tests/lora @jeejeelee /tests/models/language/generation/test_hybrid.py @tdoublep From e64011f29a63ef9c4fc67bad1fd42af4f3cfad35 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 13 Nov 2025 17:19:35 -0500 Subject: [PATCH 028/578] [CI] Bug: Fix ci entrypoint pooling (#28684) Signed-off-by: yewentao256 --- vllm/v1/engine/processor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 69509d5d4712..0404f6ff2771 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -579,6 +579,7 @@ def _validate_model_input( prompt_len == max_prompt_len and prompt_type == "decoder" and not model_config.is_multimodal_model + and self.model_config.runner_type != "pooling" ): suggestion = ( "Make sure that `max_model_len` is no smaller than the " From 6e25b1cddfd78eab307acdb5e3ec14475e465d90 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Thu, 13 Nov 2025 23:30:59 +0000 Subject: [PATCH 029/578] [KV Connector] Test async mode in scheduler tests (#28550) Signed-off-by: Mark McLoughlin --- tests/v1/core/test_scheduler.py | 100 ++++++++++++++++++---------- tests/v1/core/utils.py | 24 +++++-- tests/v1/kv_connector/unit/utils.py | 86 
+++++++++++++++++++++++- 3 files changed, 165 insertions(+), 45 deletions(-) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index d5b829e79b8f..d31338220fca 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -31,11 +31,11 @@ KVCacheConfig, KVCacheGroupSpec, ) -from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput +from vllm.v1.outputs import DraftTokenIds, KVConnectorOutput, ModelRunnerOutput from vllm.v1.request import Request, RequestStatus from vllm.v1.structured_output import StructuredOutputManager -from .utils import EOS_TOKEN_ID, create_requests, create_scheduler +from .utils import EOS_TOKEN_ID, create_requests, create_scheduler, mock_kv pytestmark = pytest.mark.cpu_test @@ -888,27 +888,65 @@ def _step_until_done( all_finished = all_done -def test_kv_connector_basic(): +def _step_until_kv_transfer_finished(scheduler: Scheduler, req_ids: list[str]): + """Cycle requests through a KV transfer cyle.""" + + # Requests should first transition to WAITING_FOR_REMOTE_KVS + output = scheduler.schedule() + assert len(scheduler.waiting) == len(req_ids) + assert len(scheduler.running) == 0 + assert len(output.scheduled_new_reqs) == 0 + for req in scheduler.requests.values(): + assert req.status == RequestStatus.WAITING_FOR_REMOTE_KVS + + # No model execution yet + EMPTY_OUTPUT = ModelRunnerOutput( + req_ids=[], + req_id_to_index={}, + sampled_token_ids=[], + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[], + ) + scheduler.update_from_output(output, EMPTY_OUTPUT) + + # Simulate KV transfer completion using KVConnectorOutput.finished_recving + output = scheduler.schedule() + assert len(scheduler.waiting) == len(req_ids) + assert len(scheduler.running) == 0 + + MODEL_RUNNER_OUTPUT = ModelRunnerOutput( + req_ids=[], + req_id_to_index={}, + sampled_token_ids=[], + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[], + kv_connector_output=KVConnectorOutput(finished_recving=req_ids), + ) + scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT) + for req_id in req_ids: + assert req_id in scheduler.finished_recving_kv_req_ids + + +@pytest.mark.parametrize("is_async", [False, True]) +def test_kv_connector_basic(is_async: bool): """ Test whether Scheduler with KVConnector schedules tokens, allocates memory, and cleans up requests as expected under normal operation. """ # Setup Scheduler. + BLOCK_SIZE = 16 + NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE * 2 scheduler = create_scheduler( enable_prefix_caching=True, - use_kv_connector=True, + use_kv_connector=mock_kv( + matched_tokens=NUM_MATCHED_NEW_TOKENS, is_async=is_async + ), + block_size=BLOCK_SIZE, ) NUM_TOTAL_BLOCKS = scheduler.kv_cache_manager.block_pool.get_num_free_blocks() - BLOCK_SIZE = scheduler.cache_config.block_size - - # Mock External Cache Hit. 
- NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE * 2 - scheduler.connector.get_num_new_matched_tokens = Mock(name="method") - scheduler.connector.get_num_new_matched_tokens.return_value = ( - NUM_MATCHED_NEW_TOKENS, - False, - ) ###################################################### # FIRST SET OF REQUESTS - External Hit Only @@ -928,6 +966,9 @@ def test_kv_connector_basic(): req_ids.append(request.request_id) req_to_index[request.request_id] = i + if is_async: + _step_until_kv_transfer_finished(scheduler, req_ids) + MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, @@ -978,6 +1019,9 @@ def test_kv_connector_basic(): req_ids.append(request.request_id) req_to_index[request.request_id] = i + if is_async: + _step_until_kv_transfer_finished(scheduler, req_ids) + MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, @@ -1020,17 +1064,10 @@ def test_external_prefix_cache_metrics(): """ # Setup Scheduler. + NUM_MATCHED_NEW_TOKENS = 4 scheduler = create_scheduler( enable_prefix_caching=False, - use_kv_connector=True, - ) - - # Mock connector to simulate a partial external cache hit - NUM_MATCHED_NEW_TOKENS = 4 - scheduler.connector.get_num_new_matched_tokens = Mock(name="method") - scheduler.connector.get_num_new_matched_tokens.return_value = ( - NUM_MATCHED_NEW_TOKENS, - False, + use_kv_connector=mock_kv(matched_tokens=NUM_MATCHED_NEW_TOKENS, is_async=False), ) # --- Prepare simple requests --- @@ -1085,21 +1122,16 @@ def test_kv_connector_unable_to_allocate(use_ec_connector, ec_role): # Setup Scheduler With Mock External Cache Hit. BLOCK_SIZE = 4 NUM_BLOCKS = 10 + NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE * 2 scheduler = create_scheduler( enable_prefix_caching=True, - use_kv_connector=True, + use_kv_connector=mock_kv(matched_tokens=NUM_MATCHED_NEW_TOKENS, is_async=False), block_size=BLOCK_SIZE, num_blocks=NUM_BLOCKS, # encoder connector should not affect test results use_ec_connector=use_ec_connector, ec_role=ec_role, ) - NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE * 2 - scheduler.connector.get_num_new_matched_tokens = Mock(name="method") - scheduler.connector.get_num_new_matched_tokens.return_value = ( - NUM_MATCHED_NEW_TOKENS, - False, - ) # Create two requests. The second request will not be able to # allocate slots because it will not have enough blocks. @@ -1174,9 +1206,10 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role): BLOCK_SIZE = 2 # NOTE: there is 1 null block, so this is 6 blocks. NUM_BLOCKS = 7 + NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE scheduler = create_scheduler( enable_prefix_caching=True, - use_kv_connector=True, + use_kv_connector=mock_kv(matched_tokens=NUM_MATCHED_NEW_TOKENS, is_async=False), block_size=BLOCK_SIZE, num_blocks=NUM_BLOCKS, # encoder connector should not affect test results @@ -1184,13 +1217,6 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role): ec_role=ec_role, ) - NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE - scheduler.connector.get_num_new_matched_tokens = Mock(name="method") - scheduler.connector.get_num_new_matched_tokens.return_value = ( - NUM_MATCHED_NEW_TOKENS, - False, - ) - # Create two requests. # Both can be scheduled at first, but the second request # will be preempted and re-scheduled. 
diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py index 3692e633322e..65511c17473b 100644 --- a/tests/v1/core/utils.py +++ b/tests/v1/core/utils.py @@ -3,6 +3,7 @@ import torch +from tests.v1.kv_connector.unit.utils import MockKVConfig from vllm.config import ( CacheConfig, ECTransferConfig, @@ -33,6 +34,10 @@ EOS_TOKEN_ID = 50256 +def mock_kv(matched_tokens: int, is_async: bool): + return MockKVConfig(matched_tokens=matched_tokens, is_async=is_async) + + def create_scheduler( model: str = "facebook/opt-125m", max_num_seqs: int = 16, @@ -40,7 +45,7 @@ def create_scheduler( enable_prefix_caching: bool | None = None, long_prefill_token_threshold: int = 0, disable_chunked_mm_input: bool = False, - use_kv_connector: bool = False, + use_kv_connector: None | bool | MockKVConfig = None, num_blocks: int = 10000, block_size: int = 16, max_model_len: int | None = None, @@ -94,15 +99,22 @@ def create_scheduler( cache_dtype="auto", **kwargs_cache, ) - kv_transfer_config = ( - KVTransferConfig( + kv_transfer_config = None + if isinstance(use_kv_connector, MockKVConfig): + kv_transfer_config = KVTransferConfig( + kv_connector="MockKVConnector", + kv_role="kv_both", + kv_connector_extra_config={ + "matched_tokens": use_kv_connector.matched_tokens, + "is_async": use_kv_connector.is_async, + }, + ) + elif use_kv_connector: + kv_transfer_config = KVTransferConfig( kv_connector="SharedStorageConnector", kv_role="kv_both", kv_connector_extra_config={"shared_storage_path": "local_storage"}, ) - if use_kv_connector - else None - ) speculative_config: SpeculativeConfig | None = None if num_speculative_tokens is not None: diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index f0031643aa9d..f35f91bb3adf 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -3,7 +3,8 @@ import tempfile from collections import defaultdict from collections.abc import Callable -from itertools import count +from dataclasses import dataclass +from itertools import chain, count from typing import Any import torch @@ -18,13 +19,18 @@ VllmConfig, ) from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory +from vllm.distributed.kv_transfer.kv_connector.v1.base import ( + KVConnectorBase_V1, + KVConnectorMetadata, + KVConnectorRole, +) from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import ( # noqa SharedStorageConnector, ) from vllm.utils.hashing import sha256 from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.core.kv_cache_utils import get_request_block_hasher, init_none_hash -from vllm.v1.core.sched.scheduler import Scheduler +from vllm.v1.core.sched.scheduler import Scheduler, SchedulerOutput from vllm.v1.kv_cache_interface import ( FullAttentionSpec, KVCacheConfig, @@ -307,6 +313,82 @@ def wrapper(*args, **kwargs): return attr +@dataclass(frozen=True) +class MockKVConfig: + matched_tokens: int = 0 + is_async: bool = False + + +class MockKVConnectorMetadata(KVConnectorMetadata): + def __init__(self): + # Scheduler tests check metadata.requests + self.requests: list = [] + + +class MockKVConnector(KVConnectorBase_V1): + """Mock KV connector for scheduler tests, supporting both sync and async mode.""" + + def __init__( + self, + vllm_config: VllmConfig, + role: KVConnectorRole, + kv_cache_config: KVCacheConfig | None = None, + ): + super().__init__(vllm_config, role, kv_cache_config) + extra_config = self._kv_transfer_config.kv_connector_extra_config + self.config = MockKVConfig( + 
matched_tokens=extra_config["matched_tokens"], + is_async=extra_config["is_async"], + ) + + def get_num_new_matched_tokens( + self, + request: Request, + num_computed_tokens: int, + ) -> tuple[int | None, bool]: + return (self.config.matched_tokens, self.config.is_async) + + def update_state_after_alloc( + self, + request: Request, + blocks: KVCacheBlocks, + num_external_tokens: int, + ): + pass + + def build_connector_meta( + self, scheduler_output: SchedulerOutput + ) -> KVConnectorMetadata: + metadata = MockKVConnectorMetadata() + cached_reqs = scheduler_output.scheduled_cached_reqs + for req_id in chain( + (req.req_id for req in scheduler_output.scheduled_new_reqs), + ( + req_id + for req_id in cached_reqs.req_ids + if req_id in cached_reqs.resumed_req_ids + ), + ): + metadata.requests.append({"req_id": req_id}) + return metadata + + def start_load_kv(self, kv_caches, finished_req_ids): + pass + + def wait_for_layer_load(self, layer_name): + pass + + def save_kv_layer(self, layer_name, kv_layer, attn_metadata, **kwargs): + pass + + def wait_for_save(self): + pass + + KVConnectorFactory.register_connector( "TestSharedStorageConnector", __name__, TestSharedStorageConnector.__name__ ) + +KVConnectorFactory.register_connector( + "MockKVConnector", __name__, MockKVConnector.__name__ +) From f2b8e1c5510cf3621dc4b910f0eba5289d9fee88 Mon Sep 17 00:00:00 2001 From: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com> Date: Thu, 13 Nov 2025 18:16:34 -0600 Subject: [PATCH 030/578] Mirrored test group definitions for AMD (2025-11-11) (#28573) Signed-off-by: Alexei V. Ivanov --- .buildkite/test-amd.yaml | 163 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 153 insertions(+), 10 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 5fd048c2ad0c..e232000511c3 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -226,6 +226,27 @@ steps: - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - popd +- label: Distributed Tests (8 GPUs) # 4min + timeout_in_minutes: 10 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_8 + # grade: Blocking + gpu: h100 + num_gpus: 8 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - examples/offline_inference/torchrun_dp_example.py + - vllm/config/parallel.py + - vllm/distributed/ + - vllm/v1/engine/llm_engine.py + - vllm/v1/executor/uniproc_executor.py + - vllm/v1/worker/gpu_worker.py + commands: + # https://github.com/NVIDIA/nccl/issues/1838 + #- export NCCL_CUMEM_HOST_ENABLE=0 + # test with torchrun tp=2 and dp=4 with ep + - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep + - label: EPLB Algorithm Test # 5min mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 @@ -238,11 +259,11 @@ steps: commands: - pytest -v -s distributed/test_eplb_algo.py -- label: EPLB Execution Test # 5min +- label: EPLB Execution Test # 10min mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_4 # grade: Blocking - timeout_in_minutes: 15 + timeout_in_minutes: 20 working_dir: "/vllm-workspace/tests" num_gpus: 4 source_file_dependencies: @@ -250,6 +271,7 @@ steps: - tests/distributed/test_eplb_execute.py commands: - pytest -v -s distributed/test_eplb_execute.py + - pytest -v -s distributed/test_eplb_spec_decode.py - label: Metrics, Tracing Test # 12min timeout_in_minutes: 20 @@ -273,7 +295,7 @@ steps: - label: Regression Test # 7min timeout_in_minutes: 
20 - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental, amdproduction, amdtentative] agent_pool: mi325_1 grade: Blocking source_file_dependencies: @@ -288,7 +310,7 @@ steps: timeout_in_minutes: 40 mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 - #grade: Blocking + # grade: Blocking source_file_dependencies: - vllm/ - tests/engine @@ -337,6 +359,7 @@ steps: - tests/v1 commands: # split the test to avoid interference + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - pytest -v -s -m 'not cpu_test' v1/core - pytest -v -s v1/executor - pytest -v -s v1/kv_offload @@ -344,7 +367,7 @@ steps: - pytest -v -s v1/logits_processors - pytest -v -s v1/worker - pytest -v -s v1/spec_decode - - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_lmcache_integration.py + - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit - pytest -v -s -m 'not cpu_test' v1/metrics - pytest -v -s v1/test_oracle.py - pytest -v -s v1/test_request.py @@ -353,6 +376,20 @@ steps: - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine +# TODO: Add the "V1 Test attetion (MI300)" test group + +- label: V1 Test attention (H100) # 10min + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + timeout_in_minutes: 30 + gpu: h100 + source_file_dependencies: + - vllm/v1/attention + - tests/v1/attention + commands: + - pytest -v -s v1/attention + - label: V1 Test others (CPU) # 5 mins mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 @@ -479,10 +516,11 @@ steps: - tests/compile commands: - pytest -v -s compile/test_basic_correctness.py + - pytest -v -s compile/test_multimodal_compile.py - pytest -v -s compile/piecewise/ -- label: PyTorch Fullgraph Test # 22min - timeout_in_minutes: 35 +- label: PyTorch Fullgraph Test # 27min + timeout_in_minutes: 40 mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking @@ -491,8 +529,23 @@ steps: - vllm/ - tests/compile commands: - - pytest -v -s compile/test_full_graph.py - - pytest -v -s compile/test_fusions_e2e.py + - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile' + # Limit to no custom ops to reduce running time + # Wrap with quotes to escape yaml and avoid starting -k string with a - + - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'" + +- label: Cudagraph test + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + source_file_dependencies: + - tests/v1/cudagraph + - vllm/v1/cudagraph_dispatcher.py + - vllm/config/compilation.py + - vllm/compilation + commands: + - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py + - pytest -v -s v1/cudagraph/test_cudagraph_mode.py - label: Kernels Core Operation Test # 48min timeout_in_minutes: 75 @@ -544,6 +597,8 @@ steps: - tests/kernels/moe - vllm/model_executor/layers/fused_moe/ - vllm/distributed/device_communicators/ + - vllm/envs.py + - vllm/config commands: - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT parallelism: 2 @@ -562,10 +617,13 @@ steps: - label: Model Executor Test # 23min timeout_in_minutes: 35 + torch_nightly: true mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking source_file_dependencies: + - 
vllm/engine/arg_utils.py + - vllm/config/model.py - vllm/model_executor - tests/model_executor - tests/entrypoints/openai/test_tensorizer_entrypoint.py @@ -861,9 +919,10 @@ steps: - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work - label: Multi-Modal Accuracy Eval (Small Models) # 10min + timeout_in_minutes: 70 mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 - timeout_in_minutes: 15 + # grade: Blocking working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" source_file_dependencies: - vllm/multimodal/ @@ -934,6 +993,7 @@ steps: - label: Transformers Nightly Models Test mirror_hardwares: [amdexperimental] agent_pool: mi325_1 + # grade: Blocking working_dir: "/vllm-workspace/" optional: true commands: @@ -961,11 +1021,16 @@ steps: - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - vllm/v1/attention/backends/flashinfer.py + - vllm/v1/attention/backends/mla/cutlass_mla.py + - vllm/v1/attention/backends/mla/flashinfer_mla.py + - vllm/platforms/cuda.py + - vllm/attention/selector.py commands: - nvidia-smi - python3 examples/offline_inference/basic/chat.py # Attention # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 + - pytest -v -s tests/kernels/attention/test_attention_selector.py - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py @@ -1002,7 +1067,33 @@ steps: - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py # this runner has 2 GPUs available even though num_gpus=2 is not set - pytest -v -s tests/compile/test_fusion_all_reduce.py + # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time + # Wrap with quotes to escape yaml + - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'" + +- label: Blackwell Fusion E2E Tests # 30 min + timeout_in_minutes: 40 + working_dir: "/vllm-workspace/" + gpu: b200 + optional: true + num_gpus: 2 + source_file_dependencies: + - csrc/quantization/fp4/ + - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/compilation/ + # can affect pattern matching + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/test_fusions_e2e.py + - tests/compile/test_full_graph.py + commands: + - nvidia-smi + # Run all e2e fusion tests - pytest -v -s tests/compile/test_fusions_e2e.py + # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) + - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile - label: Blackwell GPT-OSS Eval timeout_in_minutes: 60 @@ -1253,6 +1344,7 @@ steps: - label: NixlConnector PD accuracy tests (Distributed) # 30min mirror_hardwares: [amdexperimental] agent_pool: mi325_4 + # grade: Blocking timeout_in_minutes: 30 working_dir: "/vllm-workspace/tests" num_gpus: 4 @@ -1267,6 +1359,9 @@ steps: ##### A100 test ##### - label: Distributed Tests (A100) # optional + mirror_hardwares: [amdexperimental] + agent_pool: mi325_4 + # grade: Blocking gpu: a100 optional: true num_gpus: 
4 @@ -1281,6 +1376,9 @@ steps: - pytest -v -s -x lora/test_mixtral.py - label: LM Eval Large Models # optional + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_4 + # grade: Blocking gpu: a100 optional: true num_gpus: 4 @@ -1292,8 +1390,27 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 +##### H100 test ##### +- label: LM Eval Large Models (H100) # optional + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_4 + # grade: Blocking + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4 + ##### H200 test ##### - label: Distributed Tests (H200) # optional + mirror_hardwares: [amdexperimental] + agent_pool: mi325_2 + # grade: Blocking gpu: h200 optional: true working_dir: "/vllm-workspace/" @@ -1305,6 +1422,7 @@ steps: - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm - pytest -v -s tests/distributed/test_context_parallel.py - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 + - pytest -v -s tests/v1/distributed/test_dbo.py ##### B200 test ##### - label: Distributed Tests (B200) # optional @@ -1315,6 +1433,7 @@ steps: commands: - pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py + - pytest -v -s tests/v1/distributed/test_dbo.py ##### RL Integration Tests ##### - label: Prime-RL Integration Test # 15min @@ -1330,3 +1449,27 @@ steps: - .buildkite/scripts/run-prime-rl-test.sh commands: - bash .buildkite/scripts/run-prime-rl-test.sh + +- label: DeepSeek V2-Lite Accuracy + mirror_hardwares: [amdexperimental] + agent_pool: mi325_4 + # grade: Blocking + timeout_in_minutes: 60 + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 + +- label: Qwen3-30B-A3B-FP8-block Accuracy + mirror_hardwares: [amdexperimental] + agent_pool: mi325_4 + # grade: Blocking + timeout_in_minutes: 60 + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020 From 4d5943bda63c306275afc1a10edee26da45cd4ef Mon Sep 17 00:00:00 2001 From: Hank_ <37239608+ILikeIneine@users.noreply.github.com> Date: Fri, 14 Nov 2025 09:24:10 +0800 Subject: [PATCH 031/578] [quantization][config] enable override existing quant_config (#28510) Signed-off-by: Hank Co-authored-by: Michael Goin --- .../test_register_quantization_config.py | 12 +++++++++--- .../model_executor/layers/quantization/__init__.py | 14 +++++++++++--- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/tests/quantization/test_register_quantization_config.py b/tests/quantization/test_register_quantization_config.py index aeef4c2fd8a7..8da048703df9 100644 --- a/tests/quantization/test_register_quantization_config.py +++ 
b/tests/quantization/test_register_quantization_config.py @@ -7,6 +7,7 @@ Run `pytest tests/quantization/test_register_quantization_config.py`. """ +import logging from typing import Any import pytest @@ -100,17 +101,22 @@ def get_quant_method( return None -def test_register_quantization_config(): +def test_register_quantization_config(caplog_vllm): """Test register custom quantization config.""" # The quantization method `custom_quant` should be registered. assert get_quantization_config("custom_quant") == CustomQuantConfig # The quantization method `custom_quant` is already exists, - # should raise an error. - with pytest.raises(ValueError): + # should raise a warning when re-registering it. + with caplog_vllm.at_level(logging.WARNING): register_quantization_config("custom_quant")(CustomQuantConfig) + assert any( + "The quantization method 'custom_quant' already exists" in message + for message in caplog_vllm.messages + ), "Expected a warning when re-registering custom_quant" + @pytest.mark.parametrize( argnames="model", diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index b92fb8d266b7..bb42b10f8718 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -3,8 +3,11 @@ from typing import Literal, get_args +from vllm.logger import init_logger from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +logger = init_logger(__name__) + QuantizationMethods = Literal[ "awq", "deepspeedfp", @@ -70,15 +73,20 @@ def register_quantization_config(quantization: str): def _wrapper(quant_config_cls): if quantization in QUANTIZATION_METHODS: - raise ValueError( - f"The quantization method `{quantization}` is already exists." + logger.warning( + "The quantization method '%s' already exists and will be " + "overwritten by the quantization config %s.", + quantization, + quant_config_cls, ) + else: + QUANTIZATION_METHODS.append(quantization) + if not issubclass(quant_config_cls, QuantizationConfig): raise ValueError( "The quantization config must be a subclass of `QuantizationConfig`." 
) _CUSTOMIZED_METHOD_TO_QUANT_CONFIG[quantization] = quant_config_cls - QUANTIZATION_METHODS.append(quantization) return quant_config_cls return _wrapper From 2aa75c752bdd9ce3ebc994353fa49146caad1940 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 13 Nov 2025 17:24:28 -0800 Subject: [PATCH 032/578] [ROCm] Bump up the version of amd-smi to 6.4.3 (#28680) Signed-off-by: Sage Moore --- requirements/rocm-build.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/rocm-build.txt b/requirements/rocm-build.txt index 51f58e57a785..b977e80be067 100644 --- a/requirements/rocm-build.txt +++ b/requirements/rocm-build.txt @@ -13,5 +13,5 @@ setuptools>=77.0.3,<80.0.0 setuptools-scm>=8 wheel jinja2>=3.1.6 -amdsmi==6.2.4 +amdsmi==6.4.3 timm>=1.0.17 From 622e6106a9e3d64fb4927e3d9dc6e4f5289d174c Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 13 Nov 2025 20:49:55 -0500 Subject: [PATCH 033/578] [CPU][Bugfix] Fix Apple Silicon M1 compilation failure (#28681) Signed-off-by: mgoin --- csrc/cpu/cpu_attn_impl.hpp | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/csrc/cpu/cpu_attn_impl.hpp b/csrc/cpu/cpu_attn_impl.hpp index c317453530af..5de8a114b2b5 100644 --- a/csrc/cpu/cpu_attn_impl.hpp +++ b/csrc/cpu/cpu_attn_impl.hpp @@ -5,6 +5,10 @@ #include #include +#if defined(__APPLE__) + #include +#endif + #include "cpu_types.hpp" #include "scratchpad_manager.h" #include "cpu_attn_macros.h" @@ -741,9 +745,21 @@ class AttentionScheduler { static int64_t get_available_l2_size() { static int64_t size = []() { +#if defined(__APPLE__) + // macOS doesn't have _SC_LEVEL2_CACHE_SIZE. Use sysctlbyname. + int64_t l2_cache_size = 0; + size_t len = sizeof(l2_cache_size); + if (sysctlbyname("hw.l2cachesize", &l2_cache_size, &len, NULL, 0) == 0 && + l2_cache_size > 0) { + return l2_cache_size >> 1; // use 50% of L2 cache + } + // Fallback if sysctlbyname fails + return 128 * 1024 >> 1; // use 50% of 128KB +#else long l2_cache_size = sysconf(_SC_LEVEL2_CACHE_SIZE); TORCH_CHECK_NE(l2_cache_size, -1); return l2_cache_size >> 1; // use 50% of L2 cache +#endif }(); return size; } @@ -816,10 +832,14 @@ struct VecTypeTrait { using vec_t = vec_op::FP32Vec16; }; +// ARM only supports BF16 with ARMv8.6-A extension +#if (defined(__aarch64__) && !defined(ARM_BF16_SUPPORT)) +#else template <> struct VecTypeTrait { using vec_t = vec_op::BF16Vec16; }; +#endif #if !defined(__powerpc__) template <> @@ -1588,9 +1608,17 @@ class AttentionMainLoop { if (use_sink) { alignas(64) float s_aux_fp32[16]; +#if defined(__aarch64__) && !defined(ARM_BF16_SUPPORT) + // ARM without native BF16 support: manual conversion + for (int i = 0; i < 16; ++i) { + s_aux_fp32[i] = static_cast(curr_s_aux[i]); + } +#else + // All other platforms have BF16Vec16 available vec_op::BF16Vec16 vec_bf16(curr_s_aux); vec_op::FP32Vec16 vec_fp32(vec_bf16); vec_fp32.save(s_aux_fp32); +#endif float* __restrict__ curr_sum_buffer = sum_buffer; float* __restrict__ curr_max_buffer = max_buffer; From b39a5026ebac9242740e48debc79ce8db92c868b Mon Sep 17 00:00:00 2001 From: Bradley D Date: Thu, 13 Nov 2025 18:44:36 -0800 Subject: [PATCH 034/578] [ci][amd] fix basic models extra init test (#28676) Signed-off-by: Bradley Davis --- .buildkite/scripts/hardware_ci/run-amd-test.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index 0e5b21ddf25b..864eb470bb0a 100755 --- 
a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -59,7 +59,7 @@ while true; do fi done -echo "--- Pulling container" +echo "--- Pulling container" image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}" container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" docker pull "${image_name}" @@ -177,13 +177,13 @@ if [[ -z "$render_gid" ]]; then exit 1 fi -# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. +# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. if [[ $commands == *"--shard-id="* ]]; then - # assign job count as the number of shards used - commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "} + # assign job count as the number of shards used + commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g') for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do # assign shard-id for each shard - commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "} + commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g') echo "Shard ${GPU} commands:$commands_gpu" echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES" docker run \ From 01bea115c426a86c5e565a1fc0b9563f58e0bd1a Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 14 Nov 2025 11:10:10 +0800 Subject: [PATCH 035/578] [Misc] Remove `warn_for_unimplemented_methods` (#28613) Signed-off-by: DarkLight1337 --- vllm/utils/__init__.py | 45 ----------------------------------- vllm/v1/worker/worker_base.py | 2 -- 2 files changed, 47 deletions(-) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 9b0045279a67..040c0416c5ea 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -1,10 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import inspect import uuid import warnings -from functools import wraps from typing import Any, TypeVar import torch @@ -69,49 +67,6 @@ def random_uuid() -> str: return str(uuid.uuid4().hex) -def warn_for_unimplemented_methods(cls: type[T]) -> type[T]: - """ - A replacement for `abc.ABC`. - When we use `abc.ABC`, subclasses will fail to instantiate - if they do not implement all abstract methods. - Here, we only require `raise NotImplementedError` in the - base class, and log a warning if the method is not implemented - in the subclass. 
- """ - - original_init = cls.__init__ - - def find_unimplemented_methods(self: object): - unimplemented_methods = [] - for attr_name in dir(self): - # bypass inner method - if attr_name.startswith("_"): - continue - - try: - attr = getattr(self, attr_name) - # get the func of callable method - if callable(attr): - attr_func = attr.__func__ - except AttributeError: - continue - src = inspect.getsource(attr_func) - if "NotImplementedError" in src: - unimplemented_methods.append(attr_name) - if unimplemented_methods: - method_names = ",".join(unimplemented_methods) - msg = f"Methods {method_names} not implemented in {self}" - logger.debug(msg) - - @wraps(original_init) - def wrapped_init(self, *args, **kwargs) -> None: - original_init(self, *args, **kwargs) - find_unimplemented_methods(self) - - type.__setattr__(cls, "__init__", wrapped_init) - return cls - - def length_from_prompt_token_ids_or_embeds( prompt_token_ids: list[int] | None, prompt_embeds: torch.Tensor | None, diff --git a/vllm/v1/worker/worker_base.py b/vllm/v1/worker/worker_base.py index 30ea0ab77bd9..3991c16eefba 100644 --- a/vllm/v1/worker/worker_base.py +++ b/vllm/v1/worker/worker_base.py @@ -13,7 +13,6 @@ from vllm.lora.request import LoRARequest from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.cache import worker_receiver_cache_from_config -from vllm.utils import warn_for_unimplemented_methods from vllm.utils.import_utils import resolve_obj_by_qualname from vllm.utils.system_utils import update_environment_variables from vllm.v1.kv_cache_interface import KVCacheSpec @@ -33,7 +32,6 @@ _R = TypeVar("_R") -@warn_for_unimplemented_methods class WorkerBase: """Worker interface that allows vLLM to cleanly separate implementations for different hardware. Also abstracts control plane communication, e.g., to From da14ae0fad3165b88fcdc03a8f59f1813f8e832a Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Fri, 14 Nov 2025 11:15:50 +0800 Subject: [PATCH 036/578] [XPU][CI]disable lm cache uts (#28696) Signed-off-by: Kunshang Ji --- .buildkite/scripts/hardware_ci/run-xpu-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh index 27ed67c4517e..d49f3e2f47cf 100644 --- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -46,6 +46,6 @@ docker run \ pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py pytest -v -s v1/structured_output pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py - pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py + pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py pytest -v -s v1/test_serial_utils.py ' From 0aecd9138f45f6f687858ac1e0c5206d30c8425e Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 13 Nov 2025 21:52:53 -0800 Subject: [PATCH 037/578] [Misc] Update xformers to 0.33.0.post1 (#28678) Signed-off-by: Roger Wang --- requirements/cuda.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/cuda.txt 
b/requirements/cuda.txt index 76874cbd2f48..d63fe9e1e77c 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -9,6 +9,6 @@ torch==2.9.0 torchaudio==2.9.0 # These must be updated alongside torch torchvision==0.24.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version -xformers==0.0.33; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.9 +xformers==0.0.33.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.9 # FlashInfer should be updated together with the Dockerfile flashinfer-python==0.5.2 From 0b25498990f01ea2553c02731d6e2ce2d550156a Mon Sep 17 00:00:00 2001 From: haoyangli-amd Date: Fri, 14 Nov 2025 13:56:35 +0800 Subject: [PATCH 038/578] [Misc] add ignore mapper for quark quantization (#28275) Signed-off-by: Haoyang Li --- .../layers/quantization/quark/quark.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py index 095a66ef10f9..1bb698faf46d 100644 --- a/vllm/model_executor/layers/quantization/quark/quark.py +++ b/vllm/model_executor/layers/quantization/quark/quark.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import fnmatch -from typing import Any, Optional, cast +from typing import TYPE_CHECKING, Any, Optional, cast import torch @@ -34,6 +34,9 @@ ) from vllm.platforms import current_platform +if TYPE_CHECKING: + from vllm.model_executor.models.utils import WeightsMapper + __all__ = ["QuarkLinearMethod"] logger = init_logger(__name__) @@ -54,6 +57,7 @@ def __init__( self.kv_cache_group = kv_cache_group self.kv_cache_config = kv_cache_config self.pack_method = pack_method + self.ignore: list[str] = cast(list[str], self.quant_config.get("exclude", [])) def get_linear_method(self) -> "QuarkLinearMethod": return QuarkLinearMethod(self) @@ -74,9 +78,8 @@ def get_quant_method( from vllm.attention.layer import Attention # Avoid circular import # Check if the layer is skipped for quantization. 
- exclude_layers = cast(list[str], self.quant_config.get("exclude")) if should_ignore_layer( - prefix, ignore=exclude_layers, fused_mapping=self.packed_modules_mapping + prefix, ignore=self.ignore, fused_mapping=self.packed_modules_mapping ): return UnquantizedLinearMethod() if isinstance(layer, LinearBase): @@ -90,6 +93,9 @@ def get_quant_method( return QuarkMoEMethod.get_moe_method(self, module=layer, layer_name=prefix) return None + def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): + self.ignore = hf_to_vllm_mapper.apply_list(self.ignore) + @classmethod def from_config(cls, config: dict[str, Any]) -> "QuarkConfig": export_config = config.get("export") From 15ae8e0784d3889c6aa2c487ca00df4e3fde6f44 Mon Sep 17 00:00:00 2001 From: rasmith Date: Fri, 14 Nov 2025 00:34:01 -0600 Subject: [PATCH 039/578] [Bugfix][CI/Test][Spec Decode] Fix illegal memory access in offline_inference/spec_decode.py (Issue 27619) (#28432) Signed-off-by: Randall Smith Co-authored-by: Randall Smith Co-authored-by: TJian --- vllm/attention/ops/triton_reshape_and_cache_flash.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/attention/ops/triton_reshape_and_cache_flash.py b/vllm/attention/ops/triton_reshape_and_cache_flash.py index bbcd560ad56e..5d2ba154ae01 100644 --- a/vllm/attention/ops/triton_reshape_and_cache_flash.py +++ b/vllm/attention/ops/triton_reshape_and_cache_flash.py @@ -97,7 +97,6 @@ def triton_reshape_and_cache_flash( k_scale: torch.Tensor, # float32 v_scale: torch.Tensor, # float32 ): - num_tokens = key.shape[0] num_heads = key.shape[1] head_size = key.shape[2] block_size = key_cache.shape[1] @@ -155,7 +154,10 @@ def triton_reshape_and_cache_flash( # TODO(ngl): maybe replace with static launch grid to avoid overhead if # using cudagraphs - grid = lambda meta: (int(num_tokens), triton.cdiv(n, meta["TILE_SIZE"])) + grid = lambda meta: ( + slot_mapping.shape[0], + triton.cdiv(n, meta["TILE_SIZE"]), + ) reshape_and_cache_kernel_flash[grid]( key_ptr=key, From 93103575ce0480f36fc1a3603eb51d9a89f38a00 Mon Sep 17 00:00:00 2001 From: rasmith Date: Fri, 14 Nov 2025 00:41:29 -0600 Subject: [PATCH 040/578] [BugFix][CI/Build][ROCM] Fix import error and apply assert in appropriate case in test_struct_output_generate (#28311) Signed-off-by: Randall Smith Co-authored-by: Randall Smith --- .../v1/entrypoints/llm/test_struct_output_generate.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index 4cd26e7b41d3..a7d769c8542a 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -677,9 +677,14 @@ def test_structured_output_with_reasoning_matrices( reasoning, content = run_reasoning_extraction(reasoner, [generated_text]) print(f"Prompt: {prompt!r}\nReasoning: {reasoning!r}\nContent: {content!r}") - assert content is not None and reasoning is not None - output_json = json.loads(content) - jsonschema.validate(instance=output_json, schema=reasoning_schema) + if "Qwen3" in model_name: + assert content is not None + + assert reasoning is not None + + if content is not None: + output_json = json.loads(content) + jsonschema.validate(instance=output_json, schema=reasoning_schema) @pytest.mark.skip_global_cleanup From 529cea343da8662f135a69d9c3157f388f5eb64a Mon Sep 17 00:00:00 2001 From: Yan Ma Date: Fri, 14 Nov 2025 16:55:29 +0800 Subject: [PATCH 041/578] use default 
CCL_ZE_IPC_EXCHANGE (#28700) Signed-off-by: Yan Ma --- vllm/v1/worker/xpu_worker.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/v1/worker/xpu_worker.py b/vllm/v1/worker/xpu_worker.py index 31fa3f3bd6ac..26c6f8d06bdc 100644 --- a/vllm/v1/worker/xpu_worker.py +++ b/vllm/v1/worker/xpu_worker.py @@ -159,12 +159,10 @@ def init_device(self): else: raise RuntimeError(f"Not support device type: {self.device_config.device}") - ENV_CCL_ZE_IPC_EXCHANGE = os.getenv("CCL_ZE_IPC_EXCHANGE", "pidfd") ENV_CCL_ATL_TRANSPORT = os.getenv("CCL_ATL_TRANSPORT", "ofi") ENV_LOCAL_WORLD_SIZE = os.getenv( "LOCAL_WORLD_SIZE", str(self.parallel_config.world_size) ) - os.environ["CCL_ZE_IPC_EXCHANGE"] = ENV_CCL_ZE_IPC_EXCHANGE os.environ["CCL_ATL_TRANSPORT"] = ENV_CCL_ATL_TRANSPORT os.environ["LOCAL_WORLD_SIZE"] = ENV_LOCAL_WORLD_SIZE os.environ["LOCAL_RANK"] = str(self.local_rank) From c36bcfe6b37967ab52763f2ddb9400ff4fe3885b Mon Sep 17 00:00:00 2001 From: Jiangyun Zhu Date: Fri, 14 Nov 2025 17:01:26 +0800 Subject: [PATCH 042/578] [Bugfix] fix dots.ocr pp support (#28705) Signed-off-by: zjy0516 --- vllm/model_executor/models/dots_ocr.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/model_executor/models/dots_ocr.py b/vllm/model_executor/models/dots_ocr.py index 25e5588961a6..405af8f8be42 100644 --- a/vllm/model_executor/models/dots_ocr.py +++ b/vllm/model_executor/models/dots_ocr.py @@ -780,6 +780,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): architectures=["Qwen2ForCausalLM"], ) + self.make_empty_intermediate_tensors = ( + self.language_model.make_empty_intermediate_tensors + ) + def _parse_and_validate_image_input( self, **kwargs: object ) -> DotsOCRImageInputs | None: From bc3e43069aadb1fa301a9f60a22872b6ec4453b9 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 14 Nov 2025 01:11:13 -0800 Subject: [PATCH 043/578] [BugFix] Fix multi-modal async scheduling race condition (#28706) Signed-off-by: Nick Hill --- .../shm_object_storage.py | 6 +-- vllm/v1/serial_utils.py | 26 ++++++++---- vllm/v1/worker/gpu_model_runner.py | 42 +++++++++---------- 3 files changed, 43 insertions(+), 31 deletions(-) diff --git a/vllm/distributed/device_communicators/shm_object_storage.py b/vllm/distributed/device_communicators/shm_object_storage.py index 2ec33afb8783..4af2caa16b0d 100644 --- a/vllm/distributed/device_communicators/shm_object_storage.py +++ b/vllm/distributed/device_communicators/shm_object_storage.py @@ -342,8 +342,8 @@ def __init__(self): from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder self.encoder = MsgpackEncoder() - self.tensor_decoder = MsgpackDecoder(torch.Tensor) - self.mm_decoder = MsgpackDecoder(MultiModalKwargsItem) + self.tensor_decoder = MsgpackDecoder(torch.Tensor, share_mem=False) + self.mm_decoder = MsgpackDecoder(MultiModalKwargsItem, share_mem=False) self._mm_kwargs_item_cls = MultiModalKwargsItem def serialize(self, value: Any) -> tuple[bytes | list[bytes], int, bytes, int]: @@ -368,7 +368,7 @@ def deserialize(self, data_view: memoryview) -> Any: # pickle.loads do not read past the end of a pickled object # within a large buffer, so we can skip storing the metadata size type_name, nbytes, len_arr = pickle.loads(data_view) - serialized_data = bytearray(data_view[-nbytes:]) + serialized_data = data_view[-nbytes:] if type_name == torch.Tensor.__name__: obj = [] diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index 102357ca7c64..cf0b1a41b50f 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -31,6 +31,7 @@ 
MultiModalSharedField, NestedTensors, ) +from vllm.utils.platform_utils import is_pin_memory_available from vllm.v1.engine import UtilityResult from vllm.v1.utils import tensor_data @@ -282,7 +283,9 @@ class MsgpackDecoder: not thread-safe when encoding tensors / numpy arrays. """ - def __init__(self, t: Any | None = None): + def __init__(self, t: Any | None = None, share_mem: bool = True): + self.share_mem = share_mem + self.pin_tensors = is_pin_memory_available() args = () if t is None else (t,) self.decoder = msgpack.Decoder( *args, ext_hook=self.ext_hook, dec_hook=self.dec_hook @@ -347,21 +350,30 @@ def _decode_ndarray(self, arr: Any) -> np.ndarray: # zero-copy decode. We assume the ndarray will not be kept around, # as it now locks the whole received message buffer in memory. buffer = self.aux_buffers[data] if isinstance(data, int) else data - return np.frombuffer(buffer, dtype=dtype).reshape(shape) + arr = np.frombuffer(buffer, dtype=dtype) + if not self.share_mem: + arr = arr.copy() + return arr.reshape(shape) def _decode_tensor(self, arr: Any) -> torch.Tensor: dtype, shape, data = arr - # Copy from inline representation, to decouple the memory storage - # of the message from the original buffer. And also make Torch - # not complain about a readonly memoryview. - buffer = self.aux_buffers[data] if isinstance(data, int) else bytearray(data) + is_aux = isinstance(data, int) + buffer = self.aux_buffers[data] if is_aux else data + buffer = buffer if isinstance(buffer, memoryview) else memoryview(buffer) torch_dtype = getattr(torch, dtype) assert isinstance(torch_dtype, torch.dtype) - if not buffer: # torch.frombuffer doesn't like empty buffers + if not buffer.nbytes: # torch.frombuffer doesn't like empty buffers assert 0 in shape return torch.empty(shape, dtype=torch_dtype) # Create uint8 array arr = torch.frombuffer(buffer, dtype=torch.uint8) + # Clone ensures tensor is backed by pytorch-owned memory for safe + # future async CPU->GPU transfer. + # Pin larger tensors for more efficient CPU->GPU transfer. 
+ if not is_aux: + arr = arr.clone() + elif not self.share_mem: + arr = arr.pin_memory() if self.pin_tensors else arr.clone() # Convert back to proper shape & type return arr.view(torch_dtype).view(shape) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index c9c64137ca04..d0f7f3a501f5 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2590,28 +2590,28 @@ def execute_model( ) ) - dp_rank = self.parallel_config.data_parallel_rank - if ubatch_slices: - assert num_tokens_across_dp is not None - num_input_tokens = int(num_tokens_across_dp[dp_rank].item()) - self.pad_out_ubatch_slice(ubatch_slices, num_input_tokens) - elif num_tokens_across_dp is not None: - num_input_tokens = int(num_tokens_across_dp[dp_rank].item()) - else: - num_input_tokens = self._get_num_input_tokens( - scheduler_output.total_num_scheduled_tokens - ) + dp_rank = self.parallel_config.data_parallel_rank + if ubatch_slices: + assert num_tokens_across_dp is not None + num_input_tokens = int(num_tokens_across_dp[dp_rank].item()) + self.pad_out_ubatch_slice(ubatch_slices, num_input_tokens) + elif num_tokens_across_dp is not None: + num_input_tokens = int(num_tokens_across_dp[dp_rank].item()) + else: + num_input_tokens = self._get_num_input_tokens( + scheduler_output.total_num_scheduled_tokens + ) - ( - input_ids, - inputs_embeds, - positions, - intermediate_tensors, - model_kwargs, - ec_connector_output, - ) = self._preprocess( - scheduler_output, num_input_tokens, intermediate_tensors - ) + ( + input_ids, + inputs_embeds, + positions, + intermediate_tensors, + model_kwargs, + ec_connector_output, + ) = self._preprocess( + scheduler_output, num_input_tokens, intermediate_tensors + ) uniform_decode = ( max_num_scheduled_tokens == self.uniform_decode_query_len From c9a3a02149d83cc2840769228c4e591d39351bb6 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 14 Nov 2025 04:32:03 -0500 Subject: [PATCH 044/578] Add output token counting to gsm8k eval (#28594) Signed-off-by: mgoin --- tests/evals/gsm8k/gsm8k_eval.py | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/tests/evals/gsm8k/gsm8k_eval.py b/tests/evals/gsm8k/gsm8k_eval.py index c7799607912b..0421f8bb1859 100644 --- a/tests/evals/gsm8k/gsm8k_eval.py +++ b/tests/evals/gsm8k/gsm8k_eval.py @@ -83,8 +83,12 @@ async def call_vllm_api( stop: list[str] | None = None, url: str | None = None, seed: int | None = None, -) -> str: - """Call vLLM's OpenAI-compatible completions endpoint.""" +) -> tuple[str, int]: + """Call vLLM's OpenAI-compatible completions endpoint. 
+ + Returns: + Tuple of (response_text, completion_tokens) + """ data = { "prompt": prompt, "temperature": temperature, @@ -98,10 +102,12 @@ async def call_vllm_api( async with session.post(f"{url}/v1/completions", json=data) as response: response.raise_for_status() result = await response.json() - return result["choices"][0]["text"] + text = result["choices"][0]["text"] + completion_tokens = result.get("usage", {}).get("completion_tokens", 0) + return text, completion_tokens except Exception as e: print(f"Error calling vLLM API: {e}") - return "" + return "", 0 def evaluate_gsm8k( @@ -146,10 +152,11 @@ def evaluate_gsm8k( # Run evaluation async def run_async_evaluation(): states: list[str] = [""] * num_questions + output_tokens: list[int] = [0] * num_questions - async def get_answer(session: aiohttp.ClientSession, i: int) -> str: + async def get_answer(session: aiohttp.ClientSession, i: int) -> tuple[str, int]: prompt = few_shot_examples + questions[i] - answer = await call_vllm_api( + answer, tokens = await call_vllm_api( session=session, prompt=prompt, temperature=temperature, @@ -159,7 +166,8 @@ async def get_answer(session: aiohttp.ClientSession, i: int) -> str: seed=seed, ) states[i] = answer - return answer + output_tokens[i] = tokens + return answer, tokens async with aiohttp.ClientSession( timeout=aiohttp.ClientTimeout(total=600) @@ -167,24 +175,28 @@ async def get_answer(session: aiohttp.ClientSession, i: int) -> str: tasks = [get_answer(session, i) for i in range(num_questions)] await tqdm.gather(*tasks, desc="Evaluating") - return states + return states, output_tokens print(f"Running GSM8K evaluation: {num_questions} questions, {num_shots}-shot") tic = time.perf_counter() - states = asyncio.run(run_async_evaluation()) + states, output_tokens = asyncio.run(run_async_evaluation()) latency = time.perf_counter() - tic # Compute metrics preds = [get_answer_value(state) for state in states] accuracy = np.mean(np.array(preds) == np.array(labels)) invalid_rate = np.mean(np.array(preds) == INVALID) + total_output_tokens = sum(output_tokens) + tokens_per_second = total_output_tokens / latency if latency > 0 else 0.0 result = { "accuracy": accuracy, "invalid_rate": invalid_rate, "latency": latency, "questions_per_second": num_questions / latency, + "total_output_tokens": total_output_tokens, + "tokens_per_second": tokens_per_second, "num_questions": num_questions, "num_shots": num_shots, "max_tokens": max_tokens, @@ -236,6 +248,8 @@ def main() -> None: print(f"Invalid responses: {result['invalid_rate']:.3f}") print(f"Total latency: {result['latency']:.3f} s") print(f"Questions per second: {result['questions_per_second']:.3f}") + print(f"Total output tokens: {result['total_output_tokens']}") + print(f"Output tokens per second: {result['tokens_per_second']:.3f}") # Optional file saving if args.save_results: From fd75d3e8c0f522178e39845276fd57908760b4d0 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Fri, 14 Nov 2025 01:32:31 -0800 Subject: [PATCH 045/578] [Minor] avoid register new custom and just import silly_attn (#28578) Signed-off-by: Boyuan Feng --- tests/compile/test_config.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py index bb66ef5529b1..1e8a882a7f3e 100644 --- a/tests/compile/test_config.py +++ b/tests/compile/test_config.py @@ -15,6 +15,9 @@ from vllm.platforms import current_platform from vllm.utils.torch_utils import _is_torch_equal_or_newer +# This import automatically registers 
`torch.ops.silly.attention` +from . import silly_attention # noqa: F401 + def test_version(): # Test the version comparison logic using the private function @@ -257,15 +260,6 @@ def test_should_split(): splitting_ops = ["aten::add.Tensor"] assert not should_split(node, splitting_ops) - @torch.library.custom_op( - "silly::attention", - mutates_args=["out"], - ) - def attention( - q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, out: torch.Tensor - ) -> None: - out.copy_(q + k + v) - q, k, v, out = [torch.randn(1)] * 4 # supports custom ops as OpOverloadPacket From 8cfbe89b9389e5a10ee08059e6b2855e6c979e4e Mon Sep 17 00:00:00 2001 From: Xing Liu <93360308+xingliu14@users.noreply.github.com> Date: Fri, 14 Nov 2025 01:32:46 -0800 Subject: [PATCH 046/578] [Misc] fix comment in test_envs (#28529) Signed-off-by: Xing Liu --- tests/test_envs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_envs.py b/tests/test_envs.py index 841d7945f912..6a9835a68e7e 100644 --- a/tests/test_envs.py +++ b/tests/test_envs.py @@ -36,7 +36,7 @@ def test_getattr_with_cache(monkeypatch: pytest.MonkeyPatch): # Enable envs cache and ignore ongoing environment changes enable_envs_cache() - # __getattr__ is not decorated with functools.cache + # __getattr__ is decorated with functools.cache assert hasattr(envs.__getattr__, "cache_info") start_hits = envs.__getattr__.cache_info().hits From ecf8230d4d196566a76c907949d6569b1ff176ad Mon Sep 17 00:00:00 2001 From: lyn610 <610lyn@gmail.com> Date: Fri, 14 Nov 2025 17:47:45 +0800 Subject: [PATCH 047/578] [Metrics] Log number of preempted requests (#28522) Add tracking and periodic logging for the number of preempted requests in the metrics logger. This helps monitor system behavior under load. Signed-off-by: Yining Liu <610lyn@gmail.com> --- vllm/v1/metrics/loggers.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 1a175e9e110b..21280b9c84cf 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -118,12 +118,14 @@ def _reset(self, now): self.num_prompt_tokens: int = 0 self.num_generation_tokens: int = 0 self.num_corrupted_reqs: int = 0 + self.num_preemptions: int = 0 def _track_iteration_stats(self, iteration_stats: IterationStats): # Save tracked stats for token counters. 
self.num_prompt_tokens += iteration_stats.num_prompt_tokens self.num_generation_tokens += iteration_stats.num_generation_tokens self.num_corrupted_reqs += iteration_stats.num_corrupted_reqs + self.num_preemptions += iteration_stats.num_preempted_reqs def _get_throughput(self, tracked_stats: int, now: float) -> float: # Compute summary metrics for tracked stats @@ -196,18 +198,31 @@ def log(self): "Avg generation throughput: %.1f tokens/s", "Running: %d reqs", "Waiting: %d reqs", - "GPU KV cache usage: %.1f%%", - "Prefix cache hit rate: %.1f%%", ] log_args = [ self.last_prompt_throughput, self.last_generation_throughput, self.last_scheduler_stats.num_running_reqs, self.last_scheduler_stats.num_waiting_reqs, - self.last_scheduler_stats.kv_cache_usage * 100, - self.prefix_caching_metrics.hit_rate * 100, ] + if self.num_preemptions > 0: + log_parts.append("Preemptions: %d") + log_args.append(self.num_preemptions) + + log_parts.extend( + [ + "GPU KV cache usage: %.1f%%", + "Prefix cache hit rate: %.1f%%", + ] + ) + log_args.extend( + [ + self.last_scheduler_stats.kv_cache_usage * 100, + self.prefix_caching_metrics.hit_rate * 100, + ] + ) + if envs.VLLM_COMPUTE_NANS_IN_LOGITS: log_parts.append("Corrupted: %d reqs") log_args.append(self.num_corrupted_reqs) From 360bd8762f053c59ee19e2fd72cb1e5a28423958 Mon Sep 17 00:00:00 2001 From: Srreyansh Sethi <107075589+WorldExplored@users.noreply.github.com> Date: Fri, 14 Nov 2025 03:03:55 -0800 Subject: [PATCH 048/578] [Frontend] Added chat-style multimodal support to /classify. (#27516) Signed-off-by: WorldExplored Signed-off-by: Srreyansh Sethi <107075589+WorldExplored@users.noreply.github.com> Signed-off-by: vnadathur Signed-off-by: wang.yuqi Co-authored-by: vnadathur <236933696+vnadathur@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: vnadathur Co-authored-by: wang.yuqi Co-authored-by: wang.yuqi --- .../pooling/openai/test_classification.py | 10 ++ .../openai/test_vision_classification.py | 95 ++++++++++++++ vllm/entrypoints/openai/api_server.py | 3 + vllm/entrypoints/openai/protocol.py | 116 +++++++++++++++++- .../openai/serving_classification.py | 100 ++++++++++++--- vllm/entrypoints/openai/serving_engine.py | 21 +++- 6 files changed, 318 insertions(+), 27 deletions(-) create mode 100644 tests/entrypoints/pooling/openai/test_vision_classification.py diff --git a/tests/entrypoints/pooling/openai/test_classification.py b/tests/entrypoints/pooling/openai/test_classification.py index 671bb948780a..25080d4189c2 100644 --- a/tests/entrypoints/pooling/openai/test_classification.py +++ b/tests/entrypoints/pooling/openai/test_classification.py @@ -46,6 +46,16 @@ def test_single_input_classification(server: RemoteOpenAIServer, model_name: str assert hasattr(output.data[0], "probs") +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +def test_add_special_tokens_false(server: RemoteOpenAIServer, model_name: str): + response = requests.post( + server.url_for("classify"), + json={"model": model_name, "input": "hello", "add_special_tokens": False}, + ) + response.raise_for_status() + ClassificationResponse.model_validate(response.json()) + + @pytest.mark.parametrize("model_name", [MODEL_NAME]) def test_multiple_inputs_classification(server: RemoteOpenAIServer, model_name: str): input_texts = [ diff --git a/tests/entrypoints/pooling/openai/test_vision_classification.py b/tests/entrypoints/pooling/openai/test_vision_classification.py new file mode 100644 index 
000000000000..f2616e057b17 --- /dev/null +++ b/tests/entrypoints/pooling/openai/test_vision_classification.py @@ -0,0 +1,95 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import json + +import pytest +import requests + +from tests.utils import RemoteOpenAIServer +from vllm.entrypoints.openai.protocol import ClassificationResponse + +VLM_MODEL_NAME = "muziyongshixin/Qwen2.5-VL-7B-for-VideoCls" +MAXIMUM_VIDEOS = 1 +TEST_VIDEO_URL = "https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4" + +HF_OVERRIDES = { + "text_config": { + "architectures": ["Qwen2_5_VLForSequenceClassification"], + }, +} + + +@pytest.fixture(scope="module") +def server_vlm_classify(): + args = [ + "--runner", + "pooling", + "--max-model-len", + "5000", + "--enforce-eager", + "--limit-mm-per-prompt", + json.dumps({"video": MAXIMUM_VIDEOS}), + ] + + with RemoteOpenAIServer( + VLM_MODEL_NAME, args, override_hf_configs=HF_OVERRIDES + ) as remote_server: + yield remote_server + + +@pytest.mark.parametrize("model_name", [VLM_MODEL_NAME]) +def test_classify_accepts_chat_text_only( + server_vlm_classify: RemoteOpenAIServer, model_name: str +) -> None: + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Please classify this text request."}, + ], + } + ] + + response = requests.post( + server_vlm_classify.url_for("classify"), + json={"model": model_name, "messages": messages}, + ) + response.raise_for_status() + + output = ClassificationResponse.model_validate(response.json()) + + assert output.object == "list" + assert output.model == model_name + assert len(output.data) == 1 + assert len(output.data[0].probs) == 2 + assert output.usage.prompt_tokens == 22 + + +@pytest.mark.parametrize("model_name", [VLM_MODEL_NAME]) +def test_classify_accepts_chat_video_url( + server_vlm_classify: RemoteOpenAIServer, model_name: str +) -> None: + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Please classify this video."}, + {"type": "video_url", "video_url": {"url": TEST_VIDEO_URL}}, + ], + } + ] + + response = requests.post( + server_vlm_classify.url_for("classify"), + json={"model": model_name, "messages": messages}, + ) + response.raise_for_status() + + output = ClassificationResponse.model_validate(response.json()) + + assert output.object == "list" + assert output.model == model_name + assert len(output.data) == 1 + assert len(output.data[0].probs) == 2 + assert output.usage.prompt_tokens == 4807 diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index fbb2d32a229d..f30c6ef2cd0a 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1784,6 +1784,9 @@ async def init_app_state( engine_client, state.openai_serving_models, request_logger=request_logger, + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, + trust_request_chat_template=args.trust_request_chat_template, log_error_stack=args.log_error_stack, ) if "classify" in supported_tasks diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 69e757d4764d..45584df8b9e2 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -2000,10 +2000,10 @@ class ScoreResponse(OpenAIBaseModel): usage: UsageInfo -class ClassificationRequest(OpenAIBaseModel): +class ClassificationCompletionRequest(OpenAIBaseModel): model: str | 
None = None input: list[str] | str - truncate_prompt_tokens: int | None = None + truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None user: str | None = None # --8<-- [start:classification-extra-params] @@ -2015,7 +2015,21 @@ class ClassificationRequest(OpenAIBaseModel): "if the served model does not use priority scheduling." ), ) - + add_special_tokens: bool = Field( + default=True, + description=( + "If true (the default), special tokens (e.g. BOS) will be added to " + "the prompt." + ), + ) + request_id: str = Field( + default_factory=lambda: f"{random_uuid()}", + description=( + "The request_id related to this request. If the caller does " + "not set it, a random_uuid will be generated. This id is used " + "through out the inference process and return in response." + ), + ) softmax: bool | None = Field( default=None, description="softmax will be deprecated, please use use_activation instead.", @@ -2040,6 +2054,102 @@ def to_pooling_params(self): ) +class ClassificationChatRequest(OpenAIBaseModel): + model: str | None = None + messages: list[ChatCompletionMessageParam] + truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None + user: str | None = None + + # --8<-- [start:chat-classification-extra-params] + add_generation_prompt: bool = Field( + default=False, + description=( + "If true, the generation prompt will be added to the chat template. " + "This is a parameter used by chat template in tokenizer config of the " + "model." + ), + ) + + add_special_tokens: bool = Field( + default=False, + description=( + "If true, special tokens (e.g. BOS) will be added to the prompt " + "on top of what is added by the chat template. " + "For most models, the chat template takes care of adding the " + "special tokens so this should be set to false (as is the " + "default)." + ), + ) + + chat_template: str | None = Field( + default=None, + description=( + "A Jinja template to use for this conversion. " + "As of transformers v4.44, default chat template is no longer " + "allowed, so you must provide a chat template if the tokenizer " + "does not define one." + ), + ) + + chat_template_kwargs: dict[str, Any] | None = Field( + default=None, + description=( + "Additional keyword args to pass to the template renderer. " + "Will be accessible by the chat template." + ), + ) + + mm_processor_kwargs: dict[str, Any] | None = Field( + default=None, + description=("Additional kwargs to pass to the HF processor."), + ) + + priority: int = Field( + default=0, + description=( + "The priority of the request (lower means earlier handling; " + "default: 0). Any priority other than 0 will raise an error " + "if the served model does not use priority scheduling." + ), + ) + + request_id: str = Field( + default_factory=lambda: f"{random_uuid()}", + description=( + "The request_id related to this request. If the caller does " + "not set it, a random_uuid will be generated. This id is used " + "through out the inference process and return in response." + ), + ) + softmax: bool | None = Field( + default=None, + description="softmax will be deprecated, please use use_activation instead.", + ) + + activation: bool | None = Field( + default=None, + description="activation will be deprecated, please use use_activation instead.", + ) + + use_activation: bool | None = Field( + default=None, + description="Whether to use activation for classification outputs. 
" + "Default is True.", + ) + # --8<-- [end:chat-classification-extra-params] + + def to_pooling_params(self): + return PoolingParams( + truncate_prompt_tokens=self.truncate_prompt_tokens, + use_activation=get_use_activation(self), + ) + + +ClassificationRequest: TypeAlias = ( + ClassificationCompletionRequest | ClassificationChatRequest +) + + class ClassificationData(OpenAIBaseModel): index: int label: str | None diff --git a/vllm/entrypoints/openai/serving_classification.py b/vllm/entrypoints/openai/serving_classification.py index 45bbe732a680..167ee152fece 100644 --- a/vllm/entrypoints/openai/serving_classification.py +++ b/vllm/entrypoints/openai/serving_classification.py @@ -4,13 +4,17 @@ from http import HTTPStatus from typing import cast +import jinja2 import numpy as np from fastapi import Request -from typing_extensions import override from vllm.engine.protocol import EngineClient +from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest, + ClassificationChatRequest, + ClassificationCompletionRequest, ClassificationData, ClassificationRequest, ClassificationResponse, @@ -32,7 +36,10 @@ class ClassificationMixin(OpenAIServing): - @override + chat_template: str | None + chat_template_content_format: ChatTemplateContentFormatOption + trust_request_chat_template: bool + async def _preprocess( self, ctx: ServeContext, @@ -42,31 +49,79 @@ async def _preprocess( and prepare model-specific inputs. """ ctx = cast(ClassificationServeContext, ctx) - if isinstance(ctx.request.input, str) and not ctx.request.input: - return self.create_error_response( - "Input cannot be empty for classification", - status_code=HTTPStatus.BAD_REQUEST, - ) - - if isinstance(ctx.request.input, list) and len(ctx.request.input) == 0: - return None - try: ctx.tokenizer = await self.engine_client.get_tokenizer() - renderer = self._get_renderer(ctx.tokenizer) - ctx.engine_prompts = await renderer.render_prompt( - prompt_or_prompts=ctx.request.input, - config=self._build_render_config(ctx.request), - ) + request_obj = ctx.request + + if isinstance(request_obj, ClassificationChatRequest): + chat_request = request_obj + messages = chat_request.messages + trust_request_chat_template = getattr( + self, + "trust_request_chat_template", + False, + ) + ret = self._validate_chat_template( + request_chat_template=chat_request.chat_template, + chat_template_kwargs=chat_request.chat_template_kwargs, + trust_request_chat_template=trust_request_chat_template, + ) + if ret: + return ret + + ( + _, + _, + engine_prompts, + ) = await self._preprocess_chat( + cast(ChatCompletionRequest, chat_request), + ctx.tokenizer, + messages, + chat_template=( + chat_request.chat_template + or getattr(self, "chat_template", None) + ), + chat_template_content_format=cast( + ChatTemplateContentFormatOption, + getattr(self, "chat_template_content_format", "auto"), + ), + add_generation_prompt=False, + continue_final_message=False, + add_special_tokens=chat_request.add_special_tokens, + ) + ctx.engine_prompts = engine_prompts + + elif isinstance(request_obj, ClassificationCompletionRequest): + completion_request = request_obj + input_data = completion_request.input + if input_data in (None, ""): + return self.create_error_response( + "Input or messages must be provided", + status_code=HTTPStatus.BAD_REQUEST, + ) + if isinstance(input_data, list) and not input_data: + ctx.engine_prompts = [] + return None + + renderer 
= self._get_renderer(ctx.tokenizer) + prompt_input = cast(str | list[str], input_data) + ctx.engine_prompts = await renderer.render_prompt( + prompt_or_prompts=prompt_input, + config=self._build_render_config(completion_request), + ) + else: + return self.create_error_response( + "Invalid classification request type", + status_code=HTTPStatus.BAD_REQUEST, + ) return None - except (ValueError, TypeError) as e: + except (ValueError, TypeError, jinja2.TemplateError) as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) - @override def _build_response( self, ctx: ServeContext, @@ -118,6 +173,7 @@ def _build_render_config(self, request: ClassificationRequest) -> RenderConfig: return RenderConfig( max_length=self.max_model_len, truncate_prompt_tokens=request.truncate_prompt_tokens, + add_special_tokens=request.add_special_tokens, ) @@ -130,6 +186,9 @@ def __init__( models: OpenAIServingModels, *, request_logger: RequestLogger | None, + chat_template: str | None = None, + chat_template_content_format: ChatTemplateContentFormatOption = "auto", + trust_request_chat_template: bool = False, log_error_stack: bool = False, ) -> None: super().__init__( @@ -139,6 +198,10 @@ def __init__( log_error_stack=log_error_stack, ) + self.chat_template = chat_template + self.chat_template_content_format = chat_template_content_format + self.trust_request_chat_template = trust_request_chat_template + async def create_classify( self, request: ClassificationRequest, @@ -156,7 +219,6 @@ async def create_classify( return await super().handle(ctx) # type: ignore - @override def _create_pooling_params( self, ctx: ClassificationServeContext, diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 1456727a3cdd..03f10e5a91e6 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -43,6 +43,8 @@ ChatCompletionNamedToolChoiceParam, ChatCompletionRequest, ChatCompletionResponse, + ClassificationChatRequest, + ClassificationCompletionRequest, ClassificationRequest, ClassificationResponse, CompletionRequest, @@ -114,13 +116,16 @@ | DetokenizeRequest | EmbeddingCompletionRequest | RerankRequest - | ClassificationRequest + | ClassificationCompletionRequest | ScoreRequest | TokenizeCompletionRequest ) ChatLikeRequest: TypeAlias = ( - ChatCompletionRequest | EmbeddingChatRequest | TokenizeChatRequest + ChatCompletionRequest + | EmbeddingChatRequest + | TokenizeChatRequest + | ClassificationChatRequest ) SpeechToTextRequest: TypeAlias = TranscriptionRequest | TranslationRequest AnyRequest: TypeAlias = ( @@ -814,7 +819,11 @@ def _get_message_types(self, request: AnyRequest) -> set[str]: if not hasattr(request, "messages"): return message_types - for message in request.messages: + messages = request.messages + if messages is None or isinstance(messages, (str, bytes)): + return message_types + + for message in messages: if ( isinstance(message, dict) and "content" in message @@ -907,7 +916,8 @@ def _validate_input( EmbeddingCompletionRequest, ScoreRequest, RerankRequest, - ClassificationRequest, + ClassificationCompletionRequest, + ClassificationChatRequest, ), ): # Note: input length can be up to the entire model context length @@ -915,7 +925,8 @@ def _validate_input( if token_num > self.max_model_len: operations: dict[type[AnyRequest], str] = { ScoreRequest: "score", - ClassificationRequest: "classification", + ClassificationCompletionRequest: "classification", + 
ClassificationChatRequest: "classification", } operation = operations.get(type(request), "embedding generation") raise ValueError( From 41b92f7d38d3f056004991c026f6a24846755ef4 Mon Sep 17 00:00:00 2001 From: Shanshan Shen <467638484@qq.com> Date: Fri, 14 Nov 2025 19:16:13 +0800 Subject: [PATCH 049/578] [Model][MM] Extract conv layer as CustomOp (#28455) Signed-off-by: shen-shanshan <467638484@qq.com> Signed-off-by: Isotr0py Co-authored-by: Isotr0py --- vllm/model_executor/layers/conv.py | 236 ++++++++++++++++++ vllm/model_executor/models/clip.py | 3 +- vllm/model_executor/models/glm4_1v.py | 17 +- vllm/model_executor/models/qwen2_5_vl.py | 18 +- vllm/model_executor/models/qwen2_vl.py | 18 +- .../models/qwen3_omni_moe_thinker.py | 17 +- vllm/model_executor/models/qwen3_vl.py | 18 +- vllm/model_executor/models/vision.py | 16 -- 8 files changed, 277 insertions(+), 66 deletions(-) create mode 100644 vllm/model_executor/layers/conv.py diff --git a/vllm/model_executor/layers/conv.py b/vllm/model_executor/layers/conv.py new file mode 100644 index 000000000000..e6f2d2990c24 --- /dev/null +++ b/vllm/model_executor/layers/conv.py @@ -0,0 +1,236 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Conv Layer Class.""" + +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from vllm.model_executor.custom_op import CustomOp +from vllm.utils.torch_utils import is_torch_equal + + +class ConvLayerBase(CustomOp): + """Conv layer base class.""" + + num_dim: int + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int | tuple[int, ...], + stride: int | tuple[int, ...] = 1, + padding: int | tuple[int, ...] = 0, + dilation: int | tuple[int, ...] 
= 1, + groups: int = 1, + bias: bool = True, + padding_mode: str = "zeros", + *, + params_dtype: torch.dtype | None = None, + ) -> None: + super().__init__() + + if params_dtype is None: + params_dtype = torch.get_default_dtype() + + kernel_size = ( + (kernel_size,) * self.num_dim + if isinstance(kernel_size, int) + else kernel_size + ) + stride = (stride,) * self.num_dim if isinstance(stride, int) else stride + padding = (padding,) * self.num_dim if isinstance(padding, int) else padding + dilation = (dilation,) * self.num_dim if isinstance(dilation, int) else dilation + + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + self.padding_mode = padding_mode + + self.enable_linear = ( + (self.kernel_size == self.stride) + and not any(self.padding) + and self.groups == 1 + ) + self.input_size = in_channels * math.prod(self.kernel_size) + + self.weight = nn.Parameter( + torch.empty( + out_channels, + in_channels // groups, + *kernel_size, + dtype=params_dtype, + ), + ) + + if bias: + self.bias = nn.Parameter(torch.empty(self.out_channels, dtype=params_dtype)) + else: + self.register_parameter("bias", None) + + def extra_repr(self) -> str: + s = f"in_channels={self.in_channels}, " + s += f"out_channels={self.out_channels}, " + s += f"kernel_size={self.kernel_size}, " + s += f"stride={self.stride}, " + s += f"padding={self.padding}, " + s += f"bias={self.bias is not None}" + return s + + +@CustomOp.register("conv2d") +class Conv2dLayer(ConvLayerBase): + """Conv layer with Conv2d.""" + + num_dim = 2 + + def _forward_mulmat(self, x: torch.Tensor) -> torch.Tensor: + assert x.dim() == 4 + B, C, H, W = x.shape + K1, K2 = self.kernel_size + H, W = H // K1, W // K2 + x = x.unfold(2, K1, K1).unfold(3, K2, K2) + x = x.permute(0, 2, 3, 1, 4, 5).reshape(-1, self.input_size) + x = F.linear( + x, + self.weight.view(self.out_channels, self.input_size), + self.bias, + ) + x = x.view(B, H, W, self.out_channels).permute(0, 3, 1, 2) + return x + + def _forward_conv(self, x: torch.Tensor) -> torch.Tensor: + assert x.dim() == 4 + x = F.conv2d( + x, + self.weight, + self.bias, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + groups=self.groups, + ) + return x + + def forward_native(self, x: torch.Tensor) -> torch.Tensor: + """Expected input shape: (batch_size, in_channels, height, width)""" + assert x.dim() == 4 + if self.enable_linear: + return self._forward_mulmat(x) + else: + return self._forward_conv(x) + + def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + # By default, we use CUDNN's convolution ops with optimization. + return self._forward_conv(x) + + +class CausalConv2dLayer(Conv2dLayer): + """ + A causal version of nn.Conv2d where each location in the 2D matrix would + have no access to locations on its right or down + All arguments are the same as nn.Conv2d except padding which should be + set as None + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int, + padding: int = 0, + dilation: int = 1, + groups: int = 1, + bias: bool = True, + padding_mode: str = "zeros", + *, + params_dtype: torch.dtype | None = None, + ) -> None: + if padding is not None: + raise ValueError( + "Argument padding should be set to None for CausalConv2dLayer." 
+ ) + self._left_padding: int = kernel_size - 1 + self._right_padding: int = stride - 1 + padding = 0 + + super().__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + padding_mode, + params_dtype=params_dtype, + ) + + def forward( + self, + x: torch.Tensor, + ) -> torch.Tensor: + x = F.pad(x, pad=(self._left_padding, self._right_padding, 0, 0)) + x = super().forward(x) + return x + + +@CustomOp.register("conv3d") +class Conv3dLayer(ConvLayerBase): + """Conv layer with Conv3d.""" + + num_dim = 3 + + def _forward_mulmat(self, x: torch.Tensor) -> torch.Tensor: + assert x.dim() == 5 + B, C, T, H, W = x.shape + K1, K2, K3 = self.kernel_size + T, H, W = T // K1, H // K2, W // K3 + x = x.unfold(2, K1, K1).unfold(3, K2, K2).unfold(4, K3, K3) + x = x.permute(0, 2, 3, 4, 1, 5, 6, 7).reshape(-1, self.input_size) + x = F.linear( + x, + self.weight.view(self.out_channels, self.input_size), + self.bias, + ) + x = x.view(B, T, H, W, self.out_channels).permute(0, 4, 1, 2, 3) + return x + + def _forward_conv(self, x: torch.Tensor) -> torch.Tensor: + assert x.dim() == 5 + x = F.conv3d( + x, + self.weight, + self.bias, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + groups=self.groups, + ) + return x + + def forward_native(self, x: torch.Tensor) -> torch.Tensor: + """Expected input shape: (batch_size, in_channels, time, height, width)""" + if self.enable_linear: + return self._forward_mulmat(x) + else: + return self._forward_conv(x) + + def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + # PyTorch2.9.0 disabled CUDNN's Conv3D, which caused a + # significant performance regression. + # See: https://github.com/vllm-project/vllm/issues/27406 + # and https://github.com/pytorch/pytorch/issues/166122 + # By default, we use CUDNN's convolution ops with optimization. 
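+        # Note: the fallback below is gated on an *exact* 2.9.0 match, so other
+        # torch releases keep the (normally faster) cuDNN convolution path.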
+ if self.enable_linear and is_torch_equal("2.9.0"): + return self._forward_mulmat(x) + return self._forward_conv(x) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 50f476dfd185..5d611deb942d 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -20,6 +20,7 @@ from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, @@ -315,7 +316,7 @@ def __init__(self, config: CLIPVisionConfig): self.class_embedding = nn.Parameter(torch.randn(self.embed_dim)) - self.patch_embedding = nn.Conv2d( + self.patch_embedding = Conv2dLayer( in_channels=config.num_channels, out_channels=self.embed_dim, kernel_size=self.patch_size, diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index b2d4fe0c0139..6953b805653b 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -56,12 +56,12 @@ from vllm.distributed import get_tensor_model_parallel_world_size, parallel_state from vllm.distributed import utils as dist_utils from vllm.logger import init_logger +from vllm.model_executor.layers.conv import Conv3dLayer from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, MergedColumnParallelLinear, QKVParallelLinear, - ReplicatedLinear, RowParallelLinear, ) from vllm.model_executor.layers.quantization import QuantizationConfig @@ -103,7 +103,6 @@ maybe_prefix, ) from .vision import ( - conv3d_to_linear_weight, get_vit_attn_backend, run_dp_sharded_mrope_vision_model, ) @@ -486,15 +485,18 @@ def __init__( self.hidden_size = hidden_size kernel_size = (temporal_patch_size, patch_size, patch_size) - self.proj = ReplicatedLinear( - in_channels * math.prod(kernel_size), + self.proj = Conv3dLayer( + in_channels, hidden_size, + kernel_size=kernel_size, + stride=kernel_size, bias=True, - return_bias=False, ) def forward(self, x: torch.Tensor) -> torch.Tensor: - x = self.proj(x) + L, C = x.shape + x = x.view(L, -1, self.temporal_patch_size, self.patch_size, self.patch_size) + x = self.proj(x).view(L, self.hidden_size) return x @@ -893,9 +895,6 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loaded_params: set[str] = set() for name, loaded_weight in weights: - if name.endswith("patch_embed.proj.weight"): - loaded_weight = conv3d_to_linear_weight(loaded_weight) - for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 23591480b160..7617929e93ac 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -26,7 +26,6 @@ # limitations under the License. 
"""Inference-only Qwen2.5-VL model compatible with HuggingFace weights.""" -import math from collections.abc import Callable, Iterable, Mapping, Sequence from functools import lru_cache, partial from typing import Annotated, Any, Literal, TypeAlias @@ -56,12 +55,12 @@ from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.model_executor.layers.activation import get_act_and_mul_fn +from vllm.model_executor.layers.conv import Conv3dLayer from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, MergedColumnParallelLinear, QKVParallelLinear, - ReplicatedLinear, RowParallelLinear, ) from vllm.model_executor.layers.quantization import QuantizationConfig @@ -110,7 +109,6 @@ maybe_prefix, ) from .vision import ( - conv3d_to_linear_weight, get_vit_attn_backend, run_dp_sharded_mrope_vision_model, ) @@ -525,15 +523,18 @@ def __init__( self.hidden_size = hidden_size kernel_size = (temporal_patch_size, patch_size, patch_size) - self.proj = ReplicatedLinear( - in_channels * math.prod(kernel_size), + self.proj = Conv3dLayer( + in_channels, hidden_size, + kernel_size=kernel_size, + stride=kernel_size, bias=False, - return_bias=False, ) def forward(self, x: torch.Tensor) -> torch.Tensor: - x = self.proj(x) + L, C = x.shape + x = x.view(L, -1, self.temporal_patch_size, self.patch_size, self.patch_size) + x = self.proj(x).view(L, self.hidden_size) return x @@ -957,9 +958,6 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loaded_params: set[str] = set() for name, loaded_weight in weights: - if name.endswith("patch_embed.proj.weight"): - loaded_weight = conv3d_to_linear_weight(loaded_weight) - for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 13b54bbe1748..5d21e249fc4c 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -25,7 +25,6 @@ # limitations under the License. 
"""Inference-only Qwen2-VL model compatible with HuggingFace weights.""" -import math from collections.abc import Callable, Iterable, Mapping, Sequence from functools import partial from typing import Annotated, Any, Literal, TypeAlias @@ -54,9 +53,9 @@ from vllm.distributed import utils as dist_utils from vllm.logger import init_logger from vllm.model_executor.layers.activation import QuickGELU +from vllm.model_executor.layers.conv import Conv3dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, - ReplicatedLinear, RowParallelLinear, ) from vllm.model_executor.layers.quantization import QuantizationConfig @@ -107,7 +106,6 @@ maybe_prefix, ) from .vision import ( - conv3d_to_linear_weight, get_vit_attn_backend, run_dp_sharded_mrope_vision_model, ) @@ -566,15 +564,18 @@ def __init__( self.embed_dim = embed_dim kernel_size = (temporal_patch_size, patch_size, patch_size) - self.proj = ReplicatedLinear( - in_channels * math.prod(kernel_size), + self.proj = Conv3dLayer( + in_channels, embed_dim, + kernel_size=kernel_size, + stride=kernel_size, bias=False, - return_bias=False, ) def forward(self, x: torch.Tensor) -> torch.Tensor: - x = self.proj(x) + L, C = x.shape + x = x.view(L, -1, self.temporal_patch_size, self.patch_size, self.patch_size) + x = self.proj(x).view(L, self.embed_dim) return x @@ -844,9 +845,6 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loaded_params: set[str] = set() for name, loaded_weight in weights: - if name.endswith("patch_embed.proj.weight"): - loaded_weight = conv3d_to_linear_weight(loaded_weight) - for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index 5df2372a842c..40b80ce2387c 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -22,7 +22,6 @@ # limitations under the License. 
"""Inference-only Qwen3-Omni-Moe model (thinker part).""" -import math from collections.abc import Callable, Iterable, Mapping, Sequence from functools import partial from typing import Any @@ -54,9 +53,9 @@ from vllm.distributed import get_pp_group from vllm.logger import init_logger from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY +from vllm.model_executor.layers.conv import Conv3dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, - ReplicatedLinear, RowParallelLinear, ) from vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -102,7 +101,6 @@ maybe_prefix, ) from .vision import ( - conv3d_to_linear_weight, get_llm_pos_ids_for_vision, get_vit_attn_backend, ) @@ -138,16 +136,18 @@ def __init__( self.hidden_size = hidden_size kernel_size = (temporal_patch_size, patch_size, patch_size) - self.proj = ReplicatedLinear( - in_channels * math.prod(kernel_size), + self.proj = Conv3dLayer( + in_channels, hidden_size, + kernel_size=kernel_size, + stride=kernel_size, bias=True, - return_bias=False, ) def forward(self, x: torch.Tensor) -> torch.Tensor: L, C = x.shape - x = self.proj(x) + x = x.view(L, -1, self.temporal_patch_size, self.patch_size, self.patch_size) + x = self.proj(x).view(L, self.hidden_size) return x @@ -566,9 +566,6 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loaded_params: set[str] = set() for name, loaded_weight in weights: - if name.endswith("patch_embed.proj.weight"): - loaded_weight = conv3d_to_linear_weight(loaded_weight) - for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 5f5bde1dd72d..faeb9f81d961 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -24,7 +24,6 @@ # limitations under the License. 
"""Inference-only Qwen3VL model compatible with HuggingFace weights.""" -import math from collections.abc import Callable, Iterable, Mapping, Sequence from functools import partial from itertools import islice @@ -57,9 +56,9 @@ from vllm.distributed import get_pp_group from vllm.logger import init_logger from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY +from vllm.model_executor.layers.conv import Conv3dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, - ReplicatedLinear, RowParallelLinear, ) from vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -114,7 +113,6 @@ maybe_prefix, ) from .vision import ( - conv3d_to_linear_weight, get_vit_attn_backend, run_dp_sharded_mrope_vision_model, ) @@ -139,15 +137,18 @@ def __init__( self.hidden_size = hidden_size kernel_size = (temporal_patch_size, patch_size, patch_size) - self.proj = ReplicatedLinear( - in_channels * math.prod(kernel_size), + self.proj = Conv3dLayer( + in_channels, hidden_size, + kernel_size=kernel_size, + stride=kernel_size, bias=True, - return_bias=False, ) def forward(self, x: torch.Tensor) -> torch.Tensor: - x = self.proj(x) + L, C = x.shape + x = x.view(L, -1, self.temporal_patch_size, self.patch_size, self.patch_size) + x = self.proj(x).view(L, self.hidden_size) return x @@ -579,9 +580,6 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loaded_params: set[str] = set() for name, loaded_weight in weights: - if name.endswith("patch_embed.proj.weight"): - loaded_weight = conv3d_to_linear_weight(loaded_weight) - for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py index 0e814e5c86ad..e5d70eb7bc2f 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -550,19 +550,3 @@ def get_llm_pos_ids_for_vision( llm_pos_ids_list.append(_llm_pos_ids + start_idx) llm_pos_ids = torch.cat(llm_pos_ids_list, dim=1) return llm_pos_ids - - -# Due to a performance regression with Conv3D in PyTorch2.9, we reshape -# Conv3D weights to Linear weights for better performance. -# See: https://github.com/vllm-project/vllm/issues/27406 -# and https://github.com/pytorch/pytorch/issues/166122 -# FIXME(Isotr0py): Revert the PR introduces this workaround -# (https://github.com/vllm-project/vllm/pull/27418), -# once the performance issue is resolved in PyTorch. -def conv3d_to_linear_weight(conv3d_weight: torch.Tensor) -> torch.Tensor: - """ - Reshape Conv3D weight to Linear weight. Only work when kernel_size==stride. 
- """ - out_channels, in_channels, kt, kh, kw = conv3d_weight.shape - linear_weight = conv3d_weight.reshape(out_channels, in_channels * kt * kh * kw) - return linear_weight From 4516d44b7f990b8f92450e73720b89cc8ac155ca Mon Sep 17 00:00:00 2001 From: Jingchun Gao <63247409+gjc0824@users.noreply.github.com> Date: Fri, 14 Nov 2025 19:24:10 +0800 Subject: [PATCH 050/578] [DCP] Support Decode Context Parallel (DCP) for GQA with Flashinfer (#25438) Signed-off-by: gaojc <1055866782@qq.com> Signed-off-by: Jingchun Gao Signed-off-by: Jingchun Gao <63247409+gjc0824@users.noreply.github.com> Signed-off-by: QiuChunshuo Co-authored-by: gaojingchun (A) Co-authored-by: Jingchun Gao Co-authored-by: QiuChunshuo --- tests/distributed/test_context_parallel.py | 17 +- vllm/config/model.py | 8 + vllm/utils/flashinfer.py | 9 + vllm/v1/attention/backends/flashinfer.py | 343 ++++++++++++++++++--- vllm/v1/executor/multiproc_executor.py | 5 + 5 files changed, 331 insertions(+), 51 deletions(-) diff --git a/tests/distributed/test_context_parallel.py b/tests/distributed/test_context_parallel.py index 3576efca591c..b16fd0d06b14 100644 --- a/tests/distributed/test_context_parallel.py +++ b/tests/distributed/test_context_parallel.py @@ -39,6 +39,7 @@ class ParallelSetup(NamedTuple): class CPTestOptions(NamedTuple): multi_node_only: bool load_format: str | None = None + attn_backend: str | None = None @dataclass @@ -58,6 +59,7 @@ def detailed( multi_node_only: bool = False, runner: RunnerOption = "auto", load_format: str | None = None, + attn_backend: str | None = None, ): parallel_setups = [] for eager_mode_val in [False]: @@ -79,7 +81,9 @@ def detailed( distributed_backends=["mp"], runner=runner, test_options=CPTestOptions( - multi_node_only=multi_node_only, load_format=load_format + multi_node_only=multi_node_only, + load_format=load_format, + attn_backend=attn_backend, ), ) @@ -117,7 +121,7 @@ def _compare_cp_with_tp( chunked_prefill, ) = parallel_setup - multi_node_only, load_format = test_options + multi_node_only, load_format, attn_backend = test_options model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id) model_info.check_transformers_version(on_fail="skip") @@ -177,6 +181,13 @@ def _compare_cp_with_tp( if hf_overrides: common_args.extend(["--hf-overrides", json.dumps(hf_overrides)]) + if not attn_backend: + cp_env = tp_env = {} + else: + cp_env = tp_env = { + "VLLM_ATTENTION_BACKEND": attn_backend, + } + cp_args = [ *common_args, "--tensor-parallel-size", @@ -205,6 +216,8 @@ def _compare_cp_with_tp( model_id, cp_args, tp_args, + cp_env, + tp_env, method=method, max_wait_seconds=720, ) diff --git a/vllm/config/model.py b/vllm/config/model.py index f4ed99689e5b..8ec66b6b3160 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -1183,6 +1183,14 @@ def verify_with_parallel_config( f"but got {decode_context_parallel_size}" ) + num_q_per_kv = total_num_attention_heads // total_num_kv_heads + assert num_q_per_kv % decode_context_parallel_size == 0, ( + f"Total number of q per kv attn heads ({num_q_per_kv})" + " must be divisible by dcp world size when enable " + "decode context parallel for GQA " + f"({parallel_config.decode_context_parallel_size})." 
+ ) + def get_sliding_window(self) -> int | None: """Get the sliding window size from the HF text config if present.""" return getattr(self.hf_text_config, "sliding_window", None) diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index 62af39513d65..79e5a4c30259 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -259,6 +259,7 @@ def use_trtllm_attention( num_kv_heads: int, num_tokens: int, max_seq_len: int, + dcp_world_size: int, kv_cache_dtype: str, q_dtype: torch.dtype, is_prefill: bool, @@ -272,6 +273,14 @@ def use_trtllm_attention( if force_use_trtllm is not None and not force_use_trtllm: return False + # Decode context parallel is not supported + if dcp_world_size > 1: + logger.warning_once( + "Trtllm does not support returning LSE and as a result " + "does not support DCP, reverting to FlashInfer" + ) + return False + # The platform is not supported if not supports_trtllm_attention(): if force_use_trtllm: diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 0b650e2e0d33..4da1637d96eb 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -10,6 +10,7 @@ from flashinfer import ( BatchDecodeWithPagedKVCacheWrapper, BatchPrefillWithPagedKVCacheWrapper, + BatchPrefillWithRaggedKVCacheWrapper, MultiLevelCascadeAttentionWrapper, ) from flashinfer.decode import _get_range_buf, trtllm_batch_decode_with_kv_cache @@ -24,8 +25,11 @@ AttentionType, MultipleOf, ) +from vllm.attention.ops.common import cp_lse_ag_out_rs +from vllm.attention.ops.merge_attn_states import merge_attn_states from vllm.config import CUDAGraphMode, VllmConfig from vllm.config.cache import CacheDType +from vllm.distributed.parallel_state import get_dcp_group from vllm.logger import init_logger from vllm.model_executor.layers.batch_invariant import ( vllm_is_batch_invariant, @@ -50,6 +54,7 @@ AttentionMetadataBuilder, CommonAttentionMetadata, KVCacheLayoutType, + get_dcp_local_seq_lens, get_kv_cache_layout, get_per_layer_parameters, infer_global_hyperparameters, @@ -160,6 +165,113 @@ def trtllm_prefill_attn_kvfp8_dequant( return mock_kv_cache, mock_block_table +class BatchDCPPrefillWrapper: + def __init__( + self, + workspace_buffer: torch.Tensor | None = None, + ): + self._context = BatchPrefillWithPagedKVCacheWrapper( + workspace_buffer, get_kv_cache_layout() + ) + self._new_tokens = BatchPrefillWithRaggedKVCacheWrapper( + workspace_buffer, get_kv_cache_layout() + ) + + def plan( + self, + qo_indptr_cpu: torch.Tensor, + paged_kv_indptr_cpu: torch.Tensor, + paged_kv_indices: torch.Tensor, + paged_kv_last_page_len_cpu: torch.Tensor, + prefill_start: int, + page_size: int, + num_qo_heads: int, + dcp_world_size: int, + num_kv_heads: int, + head_dim: int, + sm_scale: float, + window_left: int, + logits_soft_cap: float | None, + q_data_type: torch.dtype, + kv_cache_dtype: torch.dtype, + prefill_fixed_split_size: int, + disable_split_kv: bool, + ): + """Plan the prefill operation with given parameters.""" + self._context.plan( + qo_indptr_cpu, + paged_kv_indptr_cpu, + paged_kv_indices, + paged_kv_last_page_len_cpu[prefill_start:], + num_qo_heads * dcp_world_size, + num_kv_heads, + head_dim, + page_size, + causal=False, # This is context run + sm_scale=sm_scale, + window_left=window_left, + logits_soft_cap=logits_soft_cap, + q_data_type=q_data_type, + kv_data_type=kv_cache_dtype, + fixed_split_size=prefill_fixed_split_size, + disable_split_kv=disable_split_kv, + ) + self._new_tokens.plan( + 
qo_indptr=qo_indptr_cpu, + kv_indptr=qo_indptr_cpu, + num_qo_heads=num_qo_heads, + num_kv_heads=num_kv_heads, + head_dim_qk=head_dim, + head_dim_vo=head_dim, + causal=True, # This is newtokens run + sm_scale=sm_scale, + window_left=window_left, + logits_soft_cap=logits_soft_cap, + q_data_type=q_data_type, + ) + + def run( + self, + layer: torch.nn.Module, + prefill_query: torch.Tensor, + kv_cache_permute: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + out: torch.Tensor, + ): + prefill_query_across_dcp = get_dcp_group().all_gather( + prefill_query.contiguous(), dim=1 + ) + output_context_tmp, lse_context_tmp = self._context.run( + prefill_query_across_dcp, + kv_cache_permute, + k_scale=layer._k_scale_float, + v_scale=layer._v_scale_float, + return_lse=True, + ) + output_context, lse_context = cp_lse_ag_out_rs( + output_context_tmp, lse_context_tmp, get_dcp_group(), return_lse=True + ) + lse_context = lse_context.transpose(0, 1).contiguous() + + output_query, lse_query = self._new_tokens.run( + prefill_query, + key, + value, + return_lse=True, + ) + lse_query = lse_query.transpose(0, 1).contiguous() + + merge_attn_states( + out, + output_context, + lse_context, + output_query, + lse_query, + ) + return out + + class FlashInferBackend(AttentionBackend): accept_output_buffer: bool = True supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16] @@ -281,7 +393,9 @@ class FlashInferMetadata: # For cascade attention (CPU for planning). use_cascade: bool - prefill_wrapper: BatchPrefillWithPagedKVCacheWrapper | None = None + prefill_wrapper: ( + BatchPrefillWithPagedKVCacheWrapper | BatchDCPPrefillWrapper | None + ) = None decode_wrapper: BatchDecodeWithPagedKVCacheWrapper | None = None cascade_wrapper: MultiLevelCascadeAttentionWrapper | None = None @@ -303,7 +417,9 @@ def __init__( self.cache_config = vllm_config.cache_config self.model_config = vllm_config.model_config self._workspace_buffer = None - self._prefill_wrapper = None # Wrapper for prefill/append + self._prefill_wrapper: ( + BatchPrefillWithPagedKVCacheWrapper | BatchDCPPrefillWrapper | None + ) = None # Wrapper for prefill/append self._decode_wrapper = None # Wrapper for decode (general shape) if vllm_is_batch_invariant(): @@ -341,9 +457,23 @@ def __init__( self.compilation_config.max_cudagraph_capture_size, ) - self.num_qo_heads = self.model_config.get_num_attention_heads( - self.vllm_config.parallel_config + try: + self.dcp_world_size = get_dcp_group().world_size + self.dcp_rank = get_dcp_group().rank_in_group + self.dcp_kv_cache_interleave_size = ( + vllm_config.parallel_config.dcp_kv_cache_interleave_size + ) + except AssertionError: + # DCP might not be initialized in testing + self.dcp_world_size = 1 + self.dcp_rank = 0 + self.dcp_kv_cache_interleave_size = 1 + + self.num_qo_heads = ( + self.model_config.get_num_attention_heads(self.vllm_config.parallel_config) + * self.dcp_world_size ) + self.num_kv_heads = self.kv_cache_spec.num_kv_heads self.head_dim = self.kv_cache_spec.head_size self.page_size = self.kv_cache_spec.block_size @@ -455,11 +585,19 @@ def _get_workspace_buffer(self): ) return self._workspace_buffer - def _get_prefill_wrapper(self): + def _get_prefill_wrapper( + self, + ) -> BatchPrefillWithPagedKVCacheWrapper | BatchDCPPrefillWrapper: if self._prefill_wrapper is None: - self._prefill_wrapper = BatchPrefillWithPagedKVCacheWrapper( - self._get_workspace_buffer(), get_kv_cache_layout() - ) + if self.dcp_world_size > 1: + self._prefill_wrapper = BatchDCPPrefillWrapper( + 
workspace_buffer=self._get_workspace_buffer(), + ) + else: + self._prefill_wrapper = BatchPrefillWithPagedKVCacheWrapper( + self._get_workspace_buffer(), get_kv_cache_layout() + ) + assert self._prefill_wrapper is not None return self._prefill_wrapper def _get_decode_wrapper(self, batch_size: int, use_cudagraph: bool = False): @@ -526,9 +664,29 @@ def build( max_seq_len = common_attn_metadata.max_seq_len seq_lens = common_attn_metadata.seq_lens seq_lens_cpu = common_attn_metadata.seq_lens_cpu - seq_lens_np = seq_lens_cpu.numpy() block_table_tensor = common_attn_metadata.block_table_tensor + qo_indptr_cpu = common_attn_metadata.query_start_loc_cpu + if self.dcp_world_size > 1: + if num_prefills > 0: + qo_indptr_prefill_cpu = ( + qo_indptr_cpu[num_decodes:] - qo_indptr_cpu[num_decodes] + ) + query_lens_prefill_cpu = ( + qo_indptr_prefill_cpu[1:] - qo_indptr_prefill_cpu[:-1] + ) + seq_lens_cpu[num_decodes:] = ( + seq_lens_cpu[num_decodes:] - query_lens_prefill_cpu + ) + + seq_lens_cpu = get_dcp_local_seq_lens( + seq_lens_cpu, + self.dcp_world_size, + self.dcp_rank, + self.dcp_kv_cache_interleave_size, + ) + + seq_lens_np = seq_lens_cpu.numpy() num_blocks_np = (seq_lens_np + (page_size - 1)) // page_size use_cascade = common_prefix_len > 0 @@ -589,7 +747,7 @@ def build( # write self.paged_kv_last_page_len_cpu inplace paged_kv_last_page_len_np = seq_lens_np % page_size self.paged_kv_last_page_len_np[:num_reqs] = np.where( - paged_kv_last_page_len_np == 0, + (paged_kv_last_page_len_np == 0) & (seq_lens_np != 0), page_size, paged_kv_last_page_len_np, ) @@ -600,13 +758,16 @@ def build( self.num_kv_heads, num_prefill_tokens, max_seq_len, + self.dcp_world_size, self.cache_dtype, self.q_data_type, is_prefill=True, has_sinks=self.has_sinks, has_spec=uses_spec_reorder, ) - decode_use_trtllm = self.use_trtllm_decode_attention + decode_use_trtllm = ( + self.use_trtllm_decode_attention and self.dcp_world_size <= 1 + ) if not (prefill_use_trtllm and decode_use_trtllm): if self.has_sinks: @@ -651,7 +812,6 @@ def build( use_cascade=use_cascade, ) - qo_indptr_cpu = common_attn_metadata.query_start_loc_cpu paged_kv_indptr_cpu = self.paged_kv_indptr_cpu[: 1 + num_reqs] paged_kv_last_page_len_cpu = self.paged_kv_last_page_len_cpu[:num_reqs] @@ -703,24 +863,52 @@ def build( attn_metadata.max_q_len_prefill = int(query_lens_prefill.max().item()) if not attn_metadata.prefill_use_trtllm: - attn_metadata.prefill_wrapper.plan( - qo_indptr_cpu, - paged_kv_indptr_cpu, - paged_kv_indices, - paged_kv_last_page_len_cpu[prefill_start:], - self.num_qo_heads, - self.num_kv_heads, - self.head_dim, - self.page_size, - causal=True, - sm_scale=self.sm_scale, - window_left=self.window_left, - logits_soft_cap=self.logits_soft_cap, - q_data_type=self.q_data_type, - kv_data_type=self.kv_cache_dtype, - fixed_split_size=self.prefill_fixed_split_size, - disable_split_kv=self.disable_split_kv, - ) + if self.dcp_world_size > 1: + assert isinstance( + attn_metadata.prefill_wrapper, BatchDCPPrefillWrapper + ) + attn_metadata.prefill_wrapper.plan( + qo_indptr_cpu=qo_indptr_cpu, + paged_kv_indptr_cpu=paged_kv_indptr_cpu, + paged_kv_indices=paged_kv_indices, + paged_kv_last_page_len_cpu=paged_kv_last_page_len_cpu, + prefill_start=prefill_start, + page_size=self.page_size, + num_qo_heads=self.num_qo_heads, + dcp_world_size=self.dcp_world_size, + num_kv_heads=self.num_kv_heads, + head_dim=self.head_dim, + sm_scale=self.sm_scale, + window_left=self.window_left, + logits_soft_cap=self.logits_soft_cap, + q_data_type=self.q_data_type, + 
kv_cache_dtype=self.kv_cache_dtype, + prefill_fixed_split_size=self.prefill_fixed_split_size, + disable_split_kv=self.disable_split_kv, + ) + else: + assert isinstance( + attn_metadata.prefill_wrapper, + BatchPrefillWithPagedKVCacheWrapper, + ) + attn_metadata.prefill_wrapper.plan( + qo_indptr_cpu, + paged_kv_indptr_cpu, + paged_kv_indices, + paged_kv_last_page_len_cpu[prefill_start:], + self.num_qo_heads, + self.num_kv_heads, + self.head_dim, + self.page_size, + causal=True, + sm_scale=self.sm_scale, + window_left=self.window_left, + logits_soft_cap=self.logits_soft_cap, + q_data_type=self.q_data_type, + kv_data_type=self.kv_cache_dtype, + fixed_split_size=self.prefill_fixed_split_size, + disable_split_kv=self.disable_split_kv, + ) else: attn_metadata.qo_indptr_gpu = qo_indptr_cpu.to( self.device, non_blocking=True @@ -770,7 +958,7 @@ def build( paged_kv_indices, self.paged_kv_last_page_len_cpu[:num_input_tokens], seq_lens_cpu[:num_input_tokens], - self.num_qo_heads, + self.num_qo_heads * self.dcp_world_size, self.num_kv_heads, self.head_dim, self.page_size, @@ -797,6 +985,8 @@ def use_cascade_attention(self, *args, **kwargs) -> bool: class FlashInferImpl(AttentionImpl): + can_return_lse_for_decode: bool = True + def __init__( self, num_heads: int, @@ -989,6 +1179,8 @@ def forward( # Inputs and outputs may be padded for CUDA graphs query = query[:num_actual_tokens] + key = key[:num_actual_tokens] + value = value[:num_actual_tokens] output_padded = output output = output[:num_actual_tokens] @@ -1015,17 +1207,46 @@ def forward( assert prefill_wrapper is not None if not attn_metadata.prefill_use_trtllm: - assert prefill_wrapper._causal - assert prefill_wrapper._window_left == self.window_left - assert prefill_wrapper._logits_soft_cap == (self.logits_soft_cap or 0.0) - assert prefill_wrapper._sm_scale == self.scale - prefill_wrapper.run( - prefill_query, - kv_cache_permute, - k_scale=layer._k_scale_float, - v_scale=layer._v_scale_float, - out=output[num_decode_tokens:], - ) + if self.dcp_world_size > 1: + assert isinstance(prefill_wrapper, BatchDCPPrefillWrapper) + assert prefill_wrapper._context._window_left == self.window_left + assert prefill_wrapper._context._logits_soft_cap == ( + self.logits_soft_cap or 0.0 + ) + assert prefill_wrapper._context._sm_scale == self.scale + assert not prefill_wrapper._context._causal + assert prefill_wrapper._new_tokens._window_left == self.window_left + assert prefill_wrapper._new_tokens._logits_soft_cap == ( + self.logits_soft_cap or 0.0 + ) + assert prefill_wrapper._new_tokens._sm_scale == self.scale + assert prefill_wrapper._new_tokens._causal + + prefill_wrapper.run( + layer, + prefill_query, + kv_cache_permute, + key[num_decode_tokens:], + value[num_decode_tokens:], + out=output[num_decode_tokens:], + ) + else: + assert isinstance( + prefill_wrapper, BatchPrefillWithPagedKVCacheWrapper + ) + assert prefill_wrapper._window_left == self.window_left + assert prefill_wrapper._logits_soft_cap == ( + self.logits_soft_cap or 0.0 + ) + assert prefill_wrapper._sm_scale == self.scale + assert prefill_wrapper._causal + prefill_wrapper.run( + prefill_query, + kv_cache_permute, + k_scale=layer._k_scale_float, + v_scale=layer._v_scale_float, + out=output[num_decode_tokens:], + ) else: # prefill_query may be non-contiguous prefill_query = prefill_query.contiguous() @@ -1101,13 +1322,37 @@ def forward( assert decode_wrapper._window_left == self.window_left assert decode_wrapper._logits_soft_cap == (self.logits_soft_cap or 0.0) assert decode_wrapper._sm_scale == 
self.scale - decode_wrapper.run( - decode_query, - kv_cache_permute, - k_scale=layer._k_scale_float, - v_scale=layer._v_scale_float, - out=output[:num_decode_tokens], - ) + + if self.dcp_world_size > 1: + decode_query = get_dcp_group().all_gather( + decode_query.contiguous(), dim=-2 + ) + output_tmp = torch.empty_like(decode_query) + lse = torch.empty( + (decode_query.size(0), decode_query.size(1)), + dtype=torch.float32, + device=decode_query.device, + ) + decode_wrapper.run( + decode_query, + kv_cache_permute, + k_scale=layer._k_scale_float, + v_scale=layer._v_scale_float, + out=output_tmp, + lse=lse, + return_lse=True, + ) + output[:num_decode_tokens] = cp_lse_ag_out_rs( + output_tmp, lse, get_dcp_group() + ) + else: + decode_wrapper.run( + decode_query, + kv_cache_permute, + k_scale=layer._k_scale_float, + v_scale=layer._v_scale_float, + out=output[:num_decode_tokens], + ) else: # decode_query may be non-contiguous decode_query = decode_query.contiguous() diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 1e249161c688..881e6ef40aaf 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -31,6 +31,7 @@ from vllm.distributed.device_communicators.shm_broadcast import Handle, MessageQueue from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator from vllm.distributed.parallel_state import ( + get_dcp_group, get_dp_group, get_ep_group, get_pp_group, @@ -726,6 +727,8 @@ def setup_proc_title_and_log_prefix(enable_ep: bool) -> None: pp_rank = get_pp_group().rank_in_group tp_size = get_tp_group().world_size tp_rank = get_tp_group().rank_in_group + dcp_size = get_dcp_group().world_size + dcp_rank = get_dcp_group().rank_in_group process_name = "Worker" if dp_size > 1: process_name += f"_DP{dp_rank}" @@ -733,6 +736,8 @@ def setup_proc_title_and_log_prefix(enable_ep: bool) -> None: process_name += f"_PP{pp_rank}" if tp_size > 1: process_name += f"_TP{tp_rank}" + if dcp_size > 1: + process_name += f"_DCP{dcp_rank}" if enable_ep: ep_rank = get_ep_group().rank_in_group process_name += f"_EP{ep_rank}" From 9324e10275cce6e0fd189bf1ebb0c399d858e9e1 Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Date: Fri, 14 Nov 2025 01:53:42 -1000 Subject: [PATCH 051/578] Fix KV sharing fast prefill with cudagraph enabled (#28537) Signed-off-by: Yong Hoon Shin Co-authored-by: Cyrus Leung --- tests/v1/e2e/test_kv_sharing_fast_prefill.py | 57 +++++--------------- vllm/v1/attention/backends/utils.py | 15 +----- vllm/v1/worker/gpu_model_runner.py | 2 +- 3 files changed, 17 insertions(+), 57 deletions(-) diff --git a/tests/v1/e2e/test_kv_sharing_fast_prefill.py b/tests/v1/e2e/test_kv_sharing_fast_prefill.py index f2c6d1c1fd1a..2778b0c5e567 100644 --- a/tests/v1/e2e/test_kv_sharing_fast_prefill.py +++ b/tests/v1/e2e/test_kv_sharing_fast_prefill.py @@ -4,13 +4,11 @@ import random import pytest -import torch from vllm import LLM, SamplingParams from vllm.config import CompilationConfig, CompilationMode -from vllm.distributed import cleanup_dist_env_and_memory -from ...utils import fork_new_process_for_each_test +from ...utils import check_answers, fork_new_process_for_each_test, prep_prompts # global seed SEED = 42 @@ -45,28 +43,12 @@ def test_prompts(): return prompts -def cleanup(llm: LLM, compilation_config: CompilationConfig): - # hacky: below lines are required to free up memory for the next test - # when setting VLLM_ENABLE_V1_MULTIPROCESSING=0, del llm is not 
sufficient - # TODO(sarckk): when enforce_eager=False, memory is not freed: - # find out why and re-enable test for enforce_eager=False case - llm_engine = llm.llm_engine.engine_core.engine_core - model_runner = llm_engine.model_executor.driver_worker.worker.model_runner - del model_runner.model - del model_runner.kv_caches - del compilation_config.static_forward_context - compilation_config.static_forward_context = {} - - del llm - torch.cuda.empty_cache() - cleanup_dist_env_and_memory() - - @fork_new_process_for_each_test -@pytest.mark.parametrize("enforce_eager", [True]) -@pytest.mark.skip(reason="Disable until Gemma3n supports fast prefill") +@pytest.mark.parametrize("kv_sharing_fast_prefill", [False, True]) +@pytest.mark.parametrize("enforce_eager", [True, False]) def test_kv_sharing_fast_prefill( monkeypatch: pytest.MonkeyPatch, + kv_sharing_fast_prefill: bool, enforce_eager: bool, test_prompts: list[str], ): @@ -79,36 +61,25 @@ def test_kv_sharing_fast_prefill( if not enforce_eager else CompilationMode.NONE, ) + batch_size = 10 with monkeypatch.context() as m: # Make scheduling deterministic for reproducibility m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") - llm = LLM( - model="google/gemma-3n-E2B-it", - enforce_eager=enforce_eager, - compilation_config=compilation_config, - seed=SEED, - ) - ref_responses = llm.generate(test_prompts, sampling_params) - - cleanup(llm, compilation_config) + prompts, answer, indices = prep_prompts(batch_size) llm = LLM( model="google/gemma-3n-E2B-it", enforce_eager=enforce_eager, compilation_config=compilation_config, seed=SEED, - kv_sharing_fast_prefill=True, + kv_sharing_fast_prefill=kv_sharing_fast_prefill, + ) + responses = llm.generate(prompts, sampling_params) + check_answers( + indices, + answer, + [response.outputs[0].text for response in responses], + accept_rate=1.0, ) - optimized_responses = llm.generate(test_prompts, sampling_params) - - cleanup(llm, compilation_config) - - misses = 0 - - for ref_response, optimized_response in zip(ref_responses, optimized_responses): - if ref_response.outputs[0].text != optimized_response.outputs[0].text: - misses += 1 - - assert misses == 0 diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index fd37a665cf05..578153cda786 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -965,12 +965,6 @@ def reshape_attn_output_for_spec_decode(attn_output: torch.Tensor) -> torch.Tens return attn_output.view(total_tokens, attn_output.shape[2], attn_output.shape[3]) -KV_SHARING_FAST_PREFILL_METADATA_FIELDS = [ - ("logits_indices_padded", torch.Tensor | None, None), - ("num_logits_indices", int, 0), -] - - def subclass_attention_metadata( name_prefix: str, metadata_cls: Any, @@ -986,8 +980,8 @@ def subclass_attention_metadata( @runtime_checkable class KVSharingFastPrefillMetadata(Protocol): - logits_indices_padded: torch.Tensor - num_logits_indices: int + logits_indices_padded: torch.Tensor | None = None + num_logits_indices: int | None = None def create_fast_prefill_custom_backend( @@ -1019,11 +1013,6 @@ def __init__(self, metadata, common_attn_metadata): for _field in fields(metadata.__class__): setattr(self, _field.name, getattr(metadata, _field.name)) - # Set additional fields that will be used in model code - assert ( - common_attn_metadata.logits_indices_padded is not None - and common_attn_metadata.num_logits_indices is not None - ) self.logits_indices_padded = ( common_attn_metadata.logits_indices_padded ) diff --git 
a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d0f7f3a501f5..341bf58f2da8 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1314,7 +1314,7 @@ def _build_attention_metadata( :return: tuple[attn_metadata, spec_decode_common_attn_metadata] """ logits_indices_padded = None - num_logits_indices = 0 + num_logits_indices = None if logits_indices is not None: num_logits_indices = logits_indices.size(0) if self.cache_config.kv_sharing_fast_prefill: From db56a59970a84842da2adc3aa64e436f42448b48 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Fri, 14 Nov 2025 07:19:22 -0500 Subject: [PATCH 052/578] [BugFix] Fix FA3 IMA with FULL_AND_PIECEWISE and cascade attention (default) (#28702) --- tests/kernels/attention/test_cascade_flash_attn.py | 1 + vllm/v1/attention/backends/flash_attn.py | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/kernels/attention/test_cascade_flash_attn.py b/tests/kernels/attention/test_cascade_flash_attn.py index 4295f852f95b..20f573821b25 100755 --- a/tests/kernels/attention/test_cascade_flash_attn.py +++ b/tests/kernels/attention/test_cascade_flash_attn.py @@ -170,6 +170,7 @@ def test_cascade( logits_soft_cap=soft_cap if soft_cap is not None else 0, block_table=block_tables, common_prefix_len=common_prefix_len, + max_num_splits=0, # no max fa_version=fa_version, ) diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 81623549ae85..a5d4435000d4 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -704,6 +704,7 @@ def forward( logits_soft_cap=self.logits_soft_cap, block_table=attn_metadata.block_table, common_prefix_len=attn_metadata.common_prefix_len, + max_num_splits=attn_metadata.max_num_splits, fa_version=self.vllm_flash_attn_version, prefix_scheduler_metadata=attn_metadata.prefix_scheduler_metadata, suffix_scheduler_metadata=attn_metadata.scheduler_metadata, @@ -950,6 +951,7 @@ def cascade_attention( logits_soft_cap: float, block_table: torch.Tensor, common_prefix_len: int, + max_num_splits: int, fa_version: int, prefix_scheduler_metadata: torch.Tensor | None = None, suffix_scheduler_metadata: torch.Tensor | None = None, @@ -994,7 +996,7 @@ def cascade_attention( # s_aux is incorporated into prefix_lse inside the GPU kernel, # enabling its effect during the final attention merge. s_aux=s_aux, - num_splits=1 if vllm_is_batch_invariant() else 0, + num_splits=1 if vllm_is_batch_invariant() else max_num_splits, ) descale_shape = (cu_query_lens.shape[0] - 1, key_cache.shape[-2]) @@ -1019,7 +1021,7 @@ def cascade_attention( q_descale=q_descale.expand(descale_shape) if q_descale is not None else None, k_descale=k_descale.expand(descale_shape) if k_descale is not None else None, v_descale=v_descale.expand(descale_shape) if v_descale is not None else None, - num_splits=1 if vllm_is_batch_invariant() else 0, + num_splits=1 if vllm_is_batch_invariant() else max_num_splits, ) # Merge prefix and suffix outputs, and store the result in output. 
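
The FA3 fix above reduces to a single invariant: the split count handed to the cascade-attention kernels must be the `max_num_splits` cap carried in the attention metadata, not the hardcoded `0` ("let the kernel decide") that the patch replaces, so that the kernel's choice stays within what the pre-sized workspace assumed under FULL_AND_PIECEWISE CUDA graphs. Below is a minimal standalone sketch of that invariant; the names (`Workspace`, `pick_num_splits`, `batch_invariant`) are invented for illustration and are not vLLM APIs.

```python
from dataclasses import dataclass


@dataclass
class Workspace:
    # Cap the intermediate buffers were sized for at CUDA-graph capture time.
    max_num_splits: int


def pick_num_splits(workspace: Workspace, batch_invariant: bool) -> int:
    """Split count to pass to the attention kernel for this launch."""
    if batch_invariant:
        # Batch-invariant mode pins a single split for deterministic results.
        return 1
    # Returning 0 would let the kernel choose freely; inside a captured graph
    # that choice could exceed the workspace cap, so the cap itself is
    # forwarded instead (mirroring the `max_num_splits` plumbing above).
    return workspace.max_num_splits


ws = Workspace(max_num_splits=8)
assert pick_num_splits(ws, batch_invariant=True) == 1
assert pick_num_splits(ws, batch_invariant=False) == 8
```
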
From 8d3748d3c718dd1dfb1f7e9e0825bc9032bff75a Mon Sep 17 00:00:00 2001 From: Fasal Shah Date: Fri, 14 Nov 2025 18:13:56 +0530 Subject: [PATCH 053/578] [Doc] Fix macOS installation dependency resolution issue (#26721) Signed-off-by: faisal shah --- docs/getting_started/installation/cpu.apple.inc.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/getting_started/installation/cpu.apple.inc.md b/docs/getting_started/installation/cpu.apple.inc.md index 7e2ed55008a5..4dc707d5f9a1 100644 --- a/docs/getting_started/installation/cpu.apple.inc.md +++ b/docs/getting_started/installation/cpu.apple.inc.md @@ -28,10 +28,15 @@ After installation of XCode and the Command Line Tools, which include Apple Clan ```bash git clone https://github.com/vllm-project/vllm.git cd vllm -uv pip install -r requirements/cpu.txt +uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match uv pip install -e . ``` +!!! tip + The `--index-strategy unsafe-best-match` flag is needed to resolve dependencies across multiple package indexes (PyTorch CPU index and PyPI). Without this flag, you may encounter `typing-extensions` version conflicts. + + The term "unsafe" refers to the package resolution strategy, not security. By default, `uv` only searches the first index where a package is found to prevent dependency confusion attacks. This flag allows `uv` to search all configured indexes to find the best compatible versions. Since both PyTorch and PyPI are trusted package sources, using this strategy is safe and appropriate for vLLM installation. + !!! note On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which is currently the only supported device. From 433c0f86751f20dbdfdeb1a711def99b7ae3df92 Mon Sep 17 00:00:00 2001 From: zhaozx-cn <59479021+zhaozx-cn@users.noreply.github.com> Date: Fri, 14 Nov 2025 21:33:02 +0800 Subject: [PATCH 054/578] [Model] Fix bailing_moe accuracy problem (#28277) Signed-off-by: zhaozx-cn --- vllm/model_executor/models/bailing_moe.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py index a87813402256..6e1e5b1ddc50 100644 --- a/vllm/model_executor/models/bailing_moe.py +++ b/vllm/model_executor/models/bailing_moe.py @@ -39,7 +39,6 @@ get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, - tensor_model_parallel_all_reduce, ) from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import SharedFusedMoE @@ -330,7 +329,9 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: final_hidden_states = final_hidden_states + shared_output if self.tp_size > 1: - final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states) + final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( + final_hidden_states + ) return final_hidden_states.view(num_tokens, hidden_size) From 96b23b8e3b5cd5d05345489a304e65f7ab53ef8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Fri, 14 Nov 2025 15:40:05 +0100 Subject: [PATCH 055/578] [Bugfix][Nixl] Fix kernel physical<>logical block_size issue (#28677) Signed-off-by: NickLucche --- tests/v1/worker/test_gpu_model_runner.py | 6 +- .../kv_connector/v1/nixl_connector.py | 67 ++++++++++++++++--- vllm/v1/worker/block_table.py | 17 +++-- 3 files changed, 73 insertions(+), 17 deletions(-) diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 
b02d9a657407..b95c8df3469b 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -985,8 +985,10 @@ def test_hybrid_block_table_initialization(): req_index = 0 block_table.append_row(kvcache_manager_blocks, req_index) # Get expected kernel blocks from the implementation for verification. - expected_kernel_blocks = block_table._map_to_kernel_blocks( - np.array(kvcache_manager_blocks) + expected_kernel_blocks = block_table.map_to_kernel_blocks( + np.array(kvcache_manager_blocks), + block_table.blocks_per_kv_block, + block_table._kernel_block_arange, ) # Verify block table state assert block_table.num_blocks_per_row[req_index] == len(expected_kernel_blocks) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 42433c717cf2..3d4547c51453 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -49,6 +49,7 @@ from vllm.utils.network_utils import make_zmq_path, make_zmq_socket from vllm.v1.attention.backends.utils import get_kv_cache_layout from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.worker.block_table import BlockTable if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionMetadata @@ -112,6 +113,8 @@ class NixlAgentMetadata(KVConnectorHandshakeMetadata): @dataclass class ReqMeta: local_block_ids: list[int] + # To be used when logical block size does not match the kernel block size + local_physical_block_ids: list[int] remote_block_ids: list[int] remote_host: str remote_port: int @@ -139,6 +142,7 @@ def add_new_req( assert load_remote_cache ^ save_to_host _req = ReqMeta( local_block_ids=local_block_ids, + local_physical_block_ids=local_block_ids, remote_block_ids=kv_transfer_params["remote_block_ids"], remote_engine_id=kv_transfer_params["remote_engine_id"], remote_host=kv_transfer_params["remote_host"], @@ -935,6 +939,7 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str): attn_backend=backend, ) self._use_pallas = self.kv_topo._use_pallas + self._physical_blocks_per_logical_kv_block = 1 def _nixl_handshake( self, @@ -1133,6 +1138,22 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): if base_addr in seen_base_addresses: continue + # TODO (NickLucche): Get kernel_block_size in a cleaner way + # NHD default "view" for non-MLA cache + kernel_block_size = cache.shape[-2] if self.use_mla else cache.shape[-3] + + if self.block_size != kernel_block_size: + logger.info_once( + "User-specified logical block size (%s) does not match" + " physical kernel block size (%s). Using the latter. ", + self.block_size, + kernel_block_size, + ) + self._physical_blocks_per_logical_kv_block = ( + self.block_size // kernel_block_size + ) + self.block_size = kernel_block_size + seen_base_addresses.append(base_addr) curr_tensor_size_bytes = cache.numel() * cache.element_size() @@ -1479,7 +1500,7 @@ def sync_recved_kv_to_device(self, req_id: str, meta: ReqMeta): assert self.use_host_buffer assert self.copy_blocks is not None - local_block_ids = meta.local_block_ids + local_block_ids = meta.local_physical_block_ids self.copy_blocks( self.host_xfer_buffers, self.device_kv_caches, @@ -1492,7 +1513,7 @@ def sync_recved_kv_to_device(self, req_id: str, meta: ReqMeta): "synced recved kv of request[%s] to device kv buffer," "local_block_ids: %s. 
", req_id, - ",".join(map(str, meta.local_block_ids)), + ",".join(map(str, local_block_ids)), ) def save_kv_to_host(self, metadata: NixlConnectorMetadata): @@ -1501,19 +1522,22 @@ def save_kv_to_host(self, metadata: NixlConnectorMetadata): assert self.copy_blocks is not None for req_id, meta in metadata.reqs_to_save.items(): + meta.local_physical_block_ids = self._logical_to_kernel_block_ids( + meta.local_block_ids + ) if logger.isEnabledFor(logging.DEBUG): logger.debug( "save_load_kv for request[%s] to host xfer buffer." "local_block_ids: %s. ", req_id, - ",".join(map(str, meta.local_block_ids)), + ",".join(map(str, meta.local_physical_block_ids)), ) # blocking self.copy_blocks( self.device_kv_caches, self.host_xfer_buffers, - meta.local_block_ids, - meta.local_block_ids, + meta.local_physical_block_ids, + meta.local_physical_block_ids, "d2h", ) @@ -1582,7 +1606,7 @@ def get_finished(self) -> tuple[set[str], set[str]]: if self.use_host_buffer: self.sync_recved_kv_to_device(req_id, meta) if self.enable_permute_local_kv: - block_ids_to_permute += meta.local_block_ids + block_ids_to_permute += meta.local_physical_block_ids if len(block_ids_to_permute) > 0: self.permute_device_kv(block_ids_to_permute) @@ -1669,7 +1693,7 @@ def _pop_done_transfers( req_id, xfer_state, ) - # mark all blocks for this request as invalid + # mark all (logical)blocks for this request as invalid if meta := self._recving_metadata.pop(req_id, None): self._invalid_block_ids.update(meta.local_block_ids) self._recving_metadata.pop(req_id, None) @@ -1686,13 +1710,19 @@ def start_load_kv(self, metadata: NixlConnectorMetadata): We check for these trnxs to complete in each step(). """ for req_id, meta in metadata.reqs_to_recv.items(): + meta.local_physical_block_ids = self._logical_to_kernel_block_ids( + meta.local_block_ids + ) + meta.remote_block_ids = self._logical_to_kernel_block_ids( + meta.remote_block_ids + ) remote_engine_id = meta.remote_engine_id logger.debug( "start_load_kv for request %s from remote engine %s. " "Num local_block_ids: %s. Num remote_block_ids: %s. ", req_id, remote_engine_id, - len(meta.local_block_ids), + len(meta.local_physical_block_ids), len(meta.remote_block_ids), ) # always store metadata for failure recovery @@ -1740,7 +1770,7 @@ def _read_blocks_for_req(self, req_id: str, meta: ReqMeta): self._read_blocks( request_id=req_id, dst_engine_id=meta.remote_engine_id, - local_block_ids=meta.local_block_ids, + local_block_ids=meta.local_physical_block_ids, remote_block_ids=meta.remote_block_ids, ) @@ -1867,7 +1897,7 @@ def _read_blocks( "Marking blocks as invalid.", request_id, ) - # mark all blocks for this request as invalid + # mark all (logical) blocks for this request as invalid if meta := self._recving_metadata.get(request_id): self._invalid_block_ids.update(meta.local_block_ids) self.xfer_stats.record_failed_transfer() @@ -1906,6 +1936,23 @@ def _get_block_descs_ids( descs_ids = region_ids * num_blocks + block_ids return descs_ids.flatten() + def _logical_to_kernel_block_ids(self, block_ids: list[int]) -> list[int]: + """ + Convert logical block ids to kernel physical block ids. + This is required when the logical block size (the one set by the user) + does not match the one required by the attn backend. 
+ """ + if self._physical_blocks_per_logical_kv_block == 1: + # Noop when physical and logical block sizes are the same + return block_ids + block_ids_np = np.array(block_ids) + block_arange = np.arange(0, self._physical_blocks_per_logical_kv_block).reshape( + 1, -1 + ) + return BlockTable.map_to_kernel_blocks( + block_ids_np, self._physical_blocks_per_logical_kv_block, block_arange + ).tolist() + def get_backend_aware_kv_block_len(self, layer_idx: int): """ Get the block length for one K/V element (K and V have the same size). diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py index c28bf542f85c..9f6c19e46430 100644 --- a/vllm/v1/worker/block_table.py +++ b/vllm/v1/worker/block_table.py @@ -98,7 +98,9 @@ def append_row( return if self.use_hybrid_blocks: - block_ids = self._map_to_kernel_blocks(np.array(block_ids)) + block_ids = self.map_to_kernel_blocks( + np.array(block_ids), self.blocks_per_kv_block, self._kernel_block_arange + ) num_blocks = len(block_ids) start = self.num_blocks_per_row[row_idx] @@ -188,7 +190,12 @@ def clear(self) -> None: self.block_table.gpu.fill_(0) self.block_table.cpu.fill_(0) - def _map_to_kernel_blocks(self, kv_manager_block_ids: np.ndarray) -> np.ndarray: + @staticmethod + def map_to_kernel_blocks( + kv_manager_block_ids: np.ndarray, + blocks_per_kv_block: int, + kernel_block_arange: np.ndarray, + ) -> np.ndarray: """Convert kv_manager_block_id IDs to kernel block IDs. Example: @@ -203,12 +210,12 @@ def _map_to_kernel_blocks(self, kv_manager_block_ids: np.ndarray) -> np.ndarray: # kv_manager_block_id 1 → kernel block id [2, 3] # kv_manager_block_id 2 → kernel block id [4, 5] """ - if not self.use_hybrid_blocks: + if blocks_per_kv_block == 1: return kv_manager_block_ids kernel_block_ids = ( - kv_manager_block_ids.reshape(-1, 1) * self.blocks_per_kv_block - + self._kernel_block_arange + kv_manager_block_ids.reshape(-1, 1) * blocks_per_kv_block + + kernel_block_arange ) return kernel_block_ids.reshape(-1) From 511a6b611d2e7e6b13e09c050147b367434f1a54 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 14 Nov 2025 22:41:02 +0800 Subject: [PATCH 056/578] [Config] Clean up SchedulerConfig initialization (#28665) Signed-off-by: DarkLight1337 --- .../models/language/generation/test_hybrid.py | 7 +- tests/v1/core/test_scheduler.py | 2 + tests/v1/sample/test_logprobs.py | 1 + vllm/config/scheduler.py | 102 +++------ vllm/engine/arg_utils.py | 208 ++++++++++++------ vllm/platforms/cpu.py | 4 +- vllm/platforms/tpu.py | 4 +- vllm/platforms/xpu.py | 4 +- vllm/utils/__init__.py | 11 +- 9 files changed, 181 insertions(+), 162 deletions(-) diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index 681b380e6a15..37830093cd3c 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -348,9 +348,14 @@ def test_fp32_cache_state( # Helper functions for the APC tests -def _get_vllm_runner_params(model, max_model_len, tensor_parallel_size=1): +def _get_vllm_runner_params( + model: str, + max_model_len: int, + tensor_parallel_size: int = 1, +): return { "model_name": model, + "enable_chunked_prefill": True, "enable_prefix_caching": False, "max_model_len": max_model_len, "tensor_parallel_size": tensor_parallel_size, diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index d31338220fca..287e735b5491 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -2256,6 +2256,8 @@ def 
test_chunked_prefill_disabled_for_encoder_decoder( scheduler_config = SchedulerConfig( enable_chunked_prefill=enable_chunked_prefill, is_encoder_decoder=is_encoder_decoder, + # Must <= max_num_batched_tokens if chunked prefill is disabled + max_model_len=SchedulerConfig.DEFAULT_MAX_NUM_BATCHED_TOKENS, ) # `is_encoder_decoder` should only be used during construction diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index 354fff22dc2a..42584938bc06 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -47,6 +47,7 @@ def vllm_model(vllm_runner, request) -> Generator[VllmRunner, None, None]: max_num_batched_tokens=16, max_num_seqs=16, max_model_len=128, + enable_chunked_prefill=True, enforce_eager=True, # TODO: enable this once we support it for # prompt logprobs. diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py index 71a06e167fd9..5117344a6844 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -4,7 +4,7 @@ import hashlib from collections.abc import Callable from dataclasses import InitVar -from typing import TYPE_CHECKING, Any, Literal, cast +from typing import TYPE_CHECKING, Any, ClassVar, Literal, cast from pydantic import Field, field_validator, model_validator from pydantic.dataclasses import dataclass @@ -12,11 +12,6 @@ from vllm.config.utils import config from vllm.logger import init_logger -from vllm.utils import ( - DEFAULT_MAX_NUM_BATCHED_TOKENS, - MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS, - POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, -) from vllm.utils.import_utils import resolve_obj_by_qualname if TYPE_CHECKING: @@ -33,25 +28,32 @@ class SchedulerConfig: """Scheduler configuration.""" + DEFAULT_MAX_NUM_BATCHED_TOKENS: ClassVar[int] = 2048 + DEFAULT_MAX_NUM_SEQS: ClassVar[int] = 128 + runner_type: RunnerType = "generate" """The runner type to launch for the model.""" - max_num_batched_tokens: int = Field(default=None, ge=1) + max_num_batched_tokens: int = Field(default=DEFAULT_MAX_NUM_BATCHED_TOKENS, ge=1) """Maximum number of tokens to be processed in a single iteration. - This config has no static default. If left unspecified by the user, it will - be set in `EngineArgs.create_engine_config` based on the usage context.""" + The default value here is mainly for convenience when testing. + In real usage, this should be set in `EngineArgs.create_engine_config`. + """ - max_num_seqs: int = Field(default=None, ge=1) + max_num_seqs: int = Field(default=DEFAULT_MAX_NUM_SEQS, ge=1) """Maximum number of sequences to be processed in a single iteration. - This config has no static default. If left unspecified by the user, it will - be set in `EngineArgs.create_engine_config` based on the usage context.""" + The default value here is mainly for convenience when testing. + In real usage, this should be set in `EngineArgs.create_engine_config`. + """ - max_model_len: int = Field(default=None, ge=1) - """Maximum length of a sequence (including prompt and generated text). This - is primarily set in `ModelConfig` and that value should be manually - duplicated here.""" + max_model_len: int = Field(default=8192, ge=1) + """Maximum length of a sequence (including prompt and generated text). + + The default value here is mainly for convenience when testing. 
+ In real usage, this should duplicate `ModelConfig.max_model_len` via + `EngineArgs`.""" max_num_partial_prefills: int = Field(default=1, ge=1) """For chunked prefill, the maximum number of sequences that can be @@ -76,9 +78,13 @@ class SchedulerConfig: NOTE: This will be replaced by speculative config in the future; it is present to enable correctness tests until then.""" - enable_chunked_prefill: bool = Field(default=None) + enable_chunked_prefill: bool = True """If True, prefill requests can be chunked based - on the remaining max_num_batched_tokens.""" + on the remaining `max_num_batched_tokens`. + + The default value here is mainly for convenience when testing. + In real usage, this should be set in `EngineArgs.create_engine_config`. + """ is_multimodal_model: bool = False """True if the model is multimodal.""" @@ -111,9 +117,6 @@ class SchedulerConfig: - "priority" means requests are handled based on given priority (lower value means earlier handling) and time of arrival deciding any ties).""" - chunked_prefill_enabled: bool = Field(init=False) - """True if chunked prefill is enabled.""" - disable_chunked_mm_input: bool = False """If set to true and chunked prefill is enabled, we do not want to partially schedule a multimodal item. Only used in V1 @@ -188,15 +191,7 @@ def compute_hash(self) -> str: hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() return hash_str - @field_validator( - "max_num_batched_tokens", - "max_num_seqs", - "max_model_len", - "enable_chunked_prefill", - "scheduler_cls", - "async_scheduling", - mode="wrap", - ) + @field_validator("scheduler_cls", "async_scheduling", mode="wrap") @classmethod def _skip_none_validation(cls, value: Any, handler: Callable) -> Any: """Skip validation if the value is `None` when initialisation is delayed.""" @@ -205,16 +200,9 @@ def _skip_none_validation(cls, value: Any, handler: Callable) -> Any: return handler(value) def __post_init__(self, is_encoder_decoder: bool) -> None: - if self.max_model_len is None: - self.max_model_len = 8192 - - if self.max_num_seqs is None: - self.max_num_seqs = 128 - if is_encoder_decoder: # Chunked prefill should be disabled for encoder-decoder models. self.disable_chunked_mm_input = True - self.chunked_prefill_enabled = False self.enable_chunked_prefill = False self.long_prefill_token_threshold = 0 logger.info( @@ -222,37 +210,6 @@ def __post_init__(self, is_encoder_decoder: bool) -> None: " prefix caching; disabling both." ) - if self.max_num_batched_tokens is None: - if self.enable_chunked_prefill: - self.max_num_batched_tokens = DEFAULT_MAX_NUM_BATCHED_TOKENS - else: - # If max_model_len is too short, use - # DEFAULT_MAX_NUM_BATCHED_TOKENS as the default value - # for higher throughput. - self.max_num_batched_tokens = max( - self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS - ) - - if self.runner_type == "pooling": - # Choose specific value for higher throughput - self.max_num_batched_tokens = max( - self.max_num_batched_tokens, - POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, - ) - if self.is_multimodal_model: - # The value needs to be at least the number of multimodal tokens - self.max_num_batched_tokens = max( - self.max_num_batched_tokens, - MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS, - ) - - # When using default settings, - # Ensure max_num_batched_tokens does not exceed model limit. - # Some models (e.g., Whisper) have embeddings tied to max length. 
- self.max_num_batched_tokens = min( - self.max_num_seqs * self.max_model_len, self.max_num_batched_tokens - ) - self.max_num_encoder_input_tokens = self.max_num_batched_tokens self.encoder_cache_size = self.max_num_batched_tokens @@ -262,7 +219,6 @@ def __post_init__(self, is_encoder_decoder: bool) -> None: self.max_num_batched_tokens, ) - self.chunked_prefill_enabled = self.enable_chunked_prefill if self.max_num_partial_prefills > 1: if self.long_prefill_token_threshold == 0: self.long_prefill_token_threshold = int(self.max_model_len * 0.04) @@ -276,6 +232,14 @@ def __post_init__(self, is_encoder_decoder: bool) -> None: self.long_prefill_token_threshold, ) + @property + def chunked_prefill_enabled(self) -> bool: + return self.enable_chunked_prefill + + @chunked_prefill_enabled.setter + def chunked_prefill_enabled(self, value: bool): + self.enable_chunked_prefill = value + @model_validator(mode="after") def _verify_args(self) -> Self: if ( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index b025004ea022..cacebc530b6e 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -428,11 +428,11 @@ class EngineArgs: cpu_offload_gb: float = CacheConfig.cpu_offload_gb gpu_memory_utilization: float = CacheConfig.gpu_memory_utilization kv_cache_memory_bytes: int | None = CacheConfig.kv_cache_memory_bytes - max_num_batched_tokens: int | None = SchedulerConfig.max_num_batched_tokens + max_num_batched_tokens: int | None = None max_num_partial_prefills: int = SchedulerConfig.max_num_partial_prefills max_long_partial_prefills: int = SchedulerConfig.max_long_partial_prefills long_prefill_token_threshold: int = SchedulerConfig.long_prefill_token_threshold - max_num_seqs: int | None = SchedulerConfig.max_num_seqs + max_num_seqs: int | None = None max_logprobs: int = ModelConfig.max_logprobs logprobs_mode: LogprobsMode = ModelConfig.logprobs_mode disable_log_stats: bool = False @@ -485,7 +485,7 @@ class EngineArgs: model_loader_extra_config: dict = get_field(LoadConfig, "model_loader_extra_config") ignore_patterns: str | list[str] = get_field(LoadConfig, "ignore_patterns") - enable_chunked_prefill: bool | None = SchedulerConfig.enable_chunked_prefill + enable_chunked_prefill: bool | None = None disable_chunked_mm_input: bool = SchedulerConfig.disable_chunked_mm_input disable_hybrid_kv_cache_manager: bool = ( @@ -1738,41 +1738,41 @@ def _check_feature_supported(self, model_config: ModelConfig): ) _raise_unsupported_error(feature_name=name) - def _set_default_args( - self, usage_context: UsageContext, model_config: ModelConfig - ) -> None: - """Set Default Arguments for V1 Engine.""" - - # V1 uses chunked prefills and prefix caching by default - # for non-pooling tasks. - # For pooling tasks the default is False + @classmethod + def get_chunked_prefill_prefix_caching_defaults( + cls, + model_config: ModelConfig, + ) -> tuple[bool, bool]: if model_config.runner_type != "pooling": - self.enable_chunked_prefill = True - - if self.enable_prefix_caching is None: - # Disable prefix caching default for hybrid models - # since the feature is still experimental. - if model_config.is_hybrid: - self.enable_prefix_caching = False - else: - self.enable_prefix_caching = True + default_chunked_prefill = True + + # Disable prefix caching default for hybrid models + # since the feature is still experimental. 
+ default_prefix_caching = not model_config.is_hybrid else: + assert model_config.pooler_config is not None + pooling_type = model_config.pooler_config.pooling_type - is_causal = getattr(model_config.hf_config, "is_causal", True) incremental_prefill_supported = ( pooling_type is not None and pooling_type.lower() == "last" - and bool(is_causal) + and getattr(model_config.hf_config, "is_causal", True) ) - action = "Enabling" if incremental_prefill_supported else "Disabling" + default_chunked_prefill = incremental_prefill_supported + default_prefix_caching = incremental_prefill_supported + + return default_chunked_prefill, default_prefix_caching + + @classmethod + def get_batch_defaults( + cls, + world_size: int, + ) -> tuple[dict[UsageContext | None, int], dict[UsageContext | None, int]]: + from vllm.usage.usage_lib import UsageContext - if self.enable_chunked_prefill is None: - self.enable_chunked_prefill = incremental_prefill_supported - logger.info("(%s) chunked prefill by default", action) - if self.enable_prefix_caching is None: - self.enable_prefix_caching = incremental_prefill_supported - logger.info("(%s) prefix caching by default", action) + default_max_num_batched_tokens: dict[UsageContext | None, int] + default_max_num_seqs: dict[UsageContext | None, int] # When no user override, set the default values based on the usage # context. @@ -1793,8 +1793,6 @@ def _set_default_args( # NOTE(Kuntai): Setting large `max_num_batched_tokens` for A100 reduces # throughput, see PR #17885 for more details. # So here we do an extra device name check to prevent such regression. - from vllm.usage.usage_lib import UsageContext - if device_memory >= 70 * GiB_bytes and "a100" not in device_name: # For GPUs like H100 and MI300x, use larger default values. default_max_num_batched_tokens = { @@ -1818,22 +1816,26 @@ def _set_default_args( # tpu specific default values. if current_platform.is_tpu(): - default_max_num_batched_tokens_tpu = { - UsageContext.LLM_CLASS: { - "V6E": 2048, - "V5E": 1024, - "V5P": 512, - }, - UsageContext.OPENAI_API_SERVER: { - "V6E": 1024, - "V5E": 512, - "V5P": 256, - }, - } + chip_name = current_platform.get_device_name() + + if chip_name == "V6E": + default_max_num_batched_tokens = { + UsageContext.LLM_CLASS: 2048, + UsageContext.OPENAI_API_SERVER: 1024, + } + elif chip_name == "V5E": + default_max_num_batched_tokens = { + UsageContext.LLM_CLASS: 1024, + UsageContext.OPENAI_API_SERVER: 512, + } + elif chip_name == "V5P": + default_max_num_batched_tokens = { + UsageContext.LLM_CLASS: 512, + UsageContext.OPENAI_API_SERVER: 256, + } # cpu specific default values. 
if current_platform.is_cpu(): - world_size = self.pipeline_parallel_size * self.tensor_parallel_size default_max_num_batched_tokens = { UsageContext.LLM_CLASS: 4096 * world_size, UsageContext.OPENAI_API_SERVER: 2048 * world_size, @@ -1843,44 +1845,104 @@ def _set_default_args( UsageContext.OPENAI_API_SERVER: 128 * world_size, } - use_context_value = usage_context.value if usage_context else None - if ( - self.max_num_batched_tokens is None - and usage_context in default_max_num_batched_tokens + return default_max_num_batched_tokens, default_max_num_seqs + + def _set_default_args( + self, usage_context: UsageContext, model_config: ModelConfig + ) -> None: + """Set Default Arguments for V1 Engine.""" + ( + default_chunked_prefill, + default_prefix_caching, + ) = self.get_chunked_prefill_prefix_caching_defaults(model_config) + + if self.enable_chunked_prefill is None: + self.enable_chunked_prefill = default_chunked_prefill + + logger.debug( + "%s chunked prefill by default", + "Enabling" if default_chunked_prefill else "Disabling", + ) + elif ( + model_config.runner_type == "pooling" + and self.enable_chunked_prefill + and not default_chunked_prefill ): - if current_platform.is_tpu(): - chip_name = current_platform.get_device_name() - if chip_name in default_max_num_batched_tokens_tpu[usage_context]: - self.max_num_batched_tokens = default_max_num_batched_tokens_tpu[ - usage_context - ][chip_name] - else: - self.max_num_batched_tokens = default_max_num_batched_tokens[ - usage_context - ] - else: - if not self.enable_chunked_prefill: - self.max_num_batched_tokens = model_config.max_model_len - else: - self.max_num_batched_tokens = default_max_num_batched_tokens[ - usage_context - ] + logger.warning( + "This model does not officially support chunked prefill. " + "Enabling this manually may cause the engine to crash " + "or produce incorrect outputs.", + ) + + if self.enable_prefix_caching is None: + self.enable_prefix_caching = default_prefix_caching + logger.debug( - "Setting max_num_batched_tokens to %d for %s usage context.", + "%s prefix caching by default", + "Enabling" if default_prefix_caching else "Disabling", + ) + elif ( + model_config.runner_type == "pooling" + and self.enable_prefix_caching + and not default_prefix_caching + ): + logger.warning( + "This model does not officially support prefix caching. " + "Enabling this manually may cause the engine to crash " + "or produce incorrect outputs.", + ) + + world_size = self.pipeline_parallel_size * self.tensor_parallel_size + ( + default_max_num_batched_tokens, + default_max_num_seqs, + ) = self.get_batch_defaults(world_size) + + orig_max_num_batched_tokens = self.max_num_batched_tokens + orig_max_num_seqs = self.max_num_seqs + + if self.max_num_batched_tokens is None: + self.max_num_batched_tokens = default_max_num_batched_tokens.get( + usage_context, + SchedulerConfig.DEFAULT_MAX_NUM_BATCHED_TOKENS, + ) + + if self.max_num_seqs is None: + self.max_num_seqs = default_max_num_seqs.get( + usage_context, + SchedulerConfig.DEFAULT_MAX_NUM_SEQS, + ) + + if orig_max_num_batched_tokens is None: + if not self.enable_chunked_prefill: + # If max_model_len is too short, use the default for higher throughput. + self.max_num_batched_tokens = max( + model_config.max_model_len, + self.max_num_batched_tokens, + ) + + # When using default settings, + # Ensure max_num_batched_tokens does not exceed model limit. + # Some models (e.g., Whisper) have embeddings tied to max length. 
+ self.max_num_batched_tokens = min( + self.max_num_seqs * model_config.max_model_len, self.max_num_batched_tokens, - use_context_value, ) - if self.max_num_seqs is None and usage_context in default_max_num_seqs: - self.max_num_seqs = min( - default_max_num_seqs[usage_context], - self.max_num_batched_tokens or sys.maxsize, + logger.debug( + "Defaulting max_num_batched_tokens to %d for %s usage context.", + self.max_num_batched_tokens, + usage_context.value if usage_context else None, ) + if orig_max_num_seqs is None: + assert self.max_num_batched_tokens is not None # For type checking + self.max_num_seqs = min(self.max_num_seqs, self.max_num_batched_tokens) + logger.debug( - "Setting max_num_seqs to %d for %s usage context.", + "Defaulting max_num_seqs to %d for %s usage context.", self.max_num_seqs, - use_context_value, + usage_context.value if usage_context else None, ) diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index cf954768689f..fdfa1c19789c 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -15,7 +15,6 @@ from vllm import envs from vllm.logger import init_logger -from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS from .interface import CpuArchEnum, Platform, PlatformEnum @@ -339,10 +338,9 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: "prefill and prefix caching to be disabled." ) vllm_config.scheduler_config.enable_chunked_prefill = False - vllm_config.scheduler_config.chunked_prefill_enabled = False vllm_config.scheduler_config.max_num_batched_tokens = max( vllm_config.scheduler_config.max_model_len, - DEFAULT_MAX_NUM_BATCHED_TOKENS, + vllm_config.scheduler_config.DEFAULT_MAX_NUM_BATCHED_TOKENS, ) @classmethod diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index b997bb9e6999..4ab037fdb77e 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -10,7 +10,6 @@ from vllm.inputs import ProcessorInputs, PromptType from vllm.logger import init_logger from vllm.sampling_params import SamplingParams, SamplingType -from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS from .interface import Platform, PlatformEnum @@ -186,10 +185,9 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: "prefill and prefix caching to be disabled." ) vllm_config.scheduler_config.enable_chunked_prefill = False - vllm_config.scheduler_config.chunked_prefill_enabled = False vllm_config.scheduler_config.max_num_batched_tokens = max( vllm_config.scheduler_config.max_model_len, - DEFAULT_MAX_NUM_BATCHED_TOKENS, + vllm_config.scheduler_config.DEFAULT_MAX_NUM_BATCHED_TOKENS, ) @classmethod diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 5552e4ca4b2f..ad4beb28bdae 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -9,7 +9,6 @@ import vllm.envs as envs from vllm.logger import init_logger -from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS from .interface import DeviceCapability, Platform, PlatformEnum @@ -185,10 +184,9 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: "prefill and prefix caching to be disabled." 
) vllm_config.scheduler_config.enable_chunked_prefill = False - vllm_config.scheduler_config.chunked_prefill_enabled = False vllm_config.scheduler_config.max_num_batched_tokens = max( vllm_config.scheduler_config.max_model_len, - DEFAULT_MAX_NUM_BATCHED_TOKENS, + vllm_config.scheduler_config.DEFAULT_MAX_NUM_BATCHED_TOKENS, ) @classmethod diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 040c0416c5ea..3ef44e770320 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -3,7 +3,7 @@ import uuid import warnings -from typing import Any, TypeVar +from typing import Any import torch @@ -39,12 +39,6 @@ def __dir__() -> list[str]: logger = init_logger(__name__) -# This value is chosen to have a balance between ITL and TTFT. Note it is -# not optimized for throughput. -DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048 -POOLING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768 -MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 5120 - # Constants related to forcing the attention backend selection # String name of register which may be set in order to @@ -60,9 +54,6 @@ def __dir__() -> list[str]: STR_INVALID_VAL: str = "INVALID" -T = TypeVar("T") - - def random_uuid() -> str: return str(uuid.uuid4().hex) From 3f8a8740656f2c0079b9e2b1623a0758a61104af Mon Sep 17 00:00:00 2001 From: Duncan Moss Date: Fri, 14 Nov 2025 08:02:44 -0800 Subject: [PATCH 057/578] [Kernels] Enable FlashInfer FP8 Blockscale on SM90 (for TEP DSR1) (#27134) Signed-off-by: Duncan Moss Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> --- .../fused_moe/flashinfer_cutlass_moe.py | 23 ++- .../flashinfer_cutlass_prepare_finalize.py | 147 ++++++++++++------ .../model_executor/layers/quantization/fp8.py | 48 ++++-- .../quantization/utils/flashinfer_utils.py | 29 +++- 4 files changed, 179 insertions(+), 68 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py index 85ce77fb1f7f..943695f921ad 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py @@ -57,6 +57,7 @@ def __init__( tp_rank: int = 0, tp_size: int = 1, use_dp: bool = False, + use_deepseek_fp8_block_scale: bool = False, ): super().__init__(quant_config) assert quant_config.quant_dtype in ("nvfp4", torch.float8_e4m3fn, None), ( @@ -69,6 +70,10 @@ def __init__( self.tp_size = tp_size self.out_dtype = out_dtype self.use_dp = use_dp + # Enables DeepSeek-style FP8 block-scale path: + # - pass per-block weight scales to the kernel + # - skip input activation quantization (kernel applies scaling) + self.use_deepseek_fp8_block_scale = use_deepseek_fp8_block_scale @property def activation_formats( @@ -147,7 +152,12 @@ def apply( "Only activation silu is supported in FlashInferExperts" ) - if self.quant_dtype == torch.float8_e4m3fn: + # Select quantization metadata based on FP8 format/path + if ( + self.quant_dtype == torch.float8_e4m3fn + and not self.use_deepseek_fp8_block_scale + ): + # FP8 per-tensor path: use global alphas/scales; do not pass input_sf quant_scales = [ self.g1_alphas, self.a2_gscale, @@ -176,6 +186,15 @@ def apply( # FlashInfer API requires weight to be long for nvfp4 fc1_expert_weights = w1.view(torch.long) fc2_expert_weights = w2.view(torch.long) + elif self.use_deepseek_fp8_block_scale: + # FP8 block-scale path: provide block-scale weights, omit a1q_scale + quant_scales = [ + 
self.w1_scale, + self.w2_scale, + ] + a1q_scale = None + fc1_expert_weights = w1 + fc2_expert_weights = w2 else: quant_scales = None a1q_scale = None @@ -196,6 +215,8 @@ def apply( ep_size=self.ep_size, ep_rank=self.ep_rank, output=output, + # Informs FlashInfer to use the block-scale decoding path when True + use_deepseek_fp8_block_scale=self.use_deepseek_fp8_block_scale, ) diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py index bc9aab5208d9..762890867e60 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py @@ -28,11 +28,15 @@ def __init__( self, use_dp: bool, num_dispatchers: int = 1, + use_deepseek_fp8_block_scale: bool = False, ): super().__init__() self.num_dispatchers_ = num_dispatchers self.use_dp = use_dp self.local_tokens = None + # Toggle for DeepSeek-style FP8 block-scale path where activations are + # not quantized here and weight block scales are consumed by the kernel. + self.use_deepseek_fp8_block_scale = use_deepseek_fp8_block_scale @property def activation_format(self) -> mk.FusedMoEActivationFormat: @@ -73,8 +77,9 @@ def __init__( self, use_dp: bool, num_dispatchers: int = 1, + use_deepseek_fp8_block_scale: bool = False, ): - super().__init__(use_dp, num_dispatchers) + super().__init__(use_dp, num_dispatchers, use_deepseek_fp8_block_scale) self.alltoall_info = None # Initialize all2all_manager only for DP case @@ -97,15 +102,19 @@ def prepare( ) if not self.use_dp: - # Non-DP case: standard quantization - a1q, a1q_scale = moe_kernel_quantize_input( - a1, - quant_config.a1_gscale, - quant_config.quant_dtype, - quant_config.per_act_token_quant, - quant_config.block_shape, - is_fp4_scale_swizzled=not self.use_dp, - ) + # Non-DP case: quantize activations unless using block-scale path + if not self.use_deepseek_fp8_block_scale: + a1q, a1q_scale = moe_kernel_quantize_input( + a1, + quant_config.a1_gscale, + quant_config.quant_dtype, + quant_config.per_act_token_quant, + quant_config.block_shape, + is_fp4_scale_swizzled=not self.use_dp, + ) + else: + a1q = a1 + a1q_scale = None else: # DP case: use FlashInfer AllToAll global_num_tokens_cpu = get_local_sizes() @@ -122,6 +131,7 @@ def prepare( top_k, num_experts, quant_config, + use_deepseek_fp8_block_scale=self.use_deepseek_fp8_block_scale, ) ) @@ -154,8 +164,9 @@ def __init__( self, use_dp: bool, num_dispatchers: int = 1, + use_deepseek_fp8_block_scale: bool = False, ): - super().__init__(use_dp, num_dispatchers) + super().__init__(use_dp, num_dispatchers, use_deepseek_fp8_block_scale) def prepare( self, @@ -173,22 +184,42 @@ def prepare( if not self.use_dp and quant_config.quant_dtype == "nvfp4": return a1, None, None, topk_ids, topk_weights - a1q, a1q_scale = moe_kernel_quantize_input( - a1, - quant_config.a1_gscale, - quant_config.quant_dtype, - quant_config.per_act_token_quant, - quant_config.block_shape, - is_fp4_scale_swizzled=not self.use_dp, - ) + if not self.use_deepseek_fp8_block_scale: + a1q, a1q_scale = moe_kernel_quantize_input( + a1, + quant_config.a1_gscale, + quant_config.quant_dtype, + quant_config.per_act_token_quant, + quant_config.block_shape, + is_fp4_scale_swizzled=not self.use_dp, + ) + else: + # Block-scale path: pass activations through, omit per-token scales + a1q = a1 + a1q_scale = None if self.use_dp: - topk_weights, topk_ids, a1q, a1q_scale = 
get_dp_group().all_gatherv( - [topk_weights, topk_ids, a1q, a1q_scale], - dim=0, - sizes=get_local_sizes(), - ) - if quant_config.quant_dtype == "nvfp4": + # Build gather list conditionally - omit a1q_scale if None + # (block-scale path) + gather_list = [topk_weights, topk_ids, a1q] + if a1q_scale is not None: + gather_list.append(a1q_scale) + gathered = get_dp_group().all_gatherv( + gather_list, + dim=0, + sizes=get_local_sizes(), + ) + topk_weights, topk_ids, a1q, a1q_scale = gathered + else: + gathered = get_dp_group().all_gatherv( + gather_list, + dim=0, + sizes=get_local_sizes(), + ) + topk_weights, topk_ids, a1q = gathered + a1q_scale = None + + if quant_config.quant_dtype == "nvfp4" and a1q_scale is not None: a1q_scale = nvfp4_block_scale_interleave(a1q_scale) return a1q, a1q_scale, None, topk_ids, topk_weights @@ -221,6 +252,7 @@ def flashinfer_alltoall_dispatch( top_k: int, num_experts: int, quant_config: FusedMoEQuantConfig, + use_deepseek_fp8_block_scale: bool = False, ): from flashinfer.comm.trtllm_alltoall import MnnvlMoe @@ -250,30 +282,42 @@ def flashinfer_alltoall_dispatch( ) topk_weights = topk_weights.view(dtype=orig_topk_weights_dtype) - x, x_sf = moe_kernel_quantize_input( - x, - gs, - quant_config.quant_dtype, - quant_config.per_act_token_quant, - quant_config.block_shape, - is_fp4_scale_swizzled=False, # delay swizzle to after comm - ) - x = MnnvlMoe.mnnvl_moe_alltoallv( - x, - alltoall_info, - all2all_manager.workspace_tensor, - ep_rank, - ep_size, - ) + if not use_deepseek_fp8_block_scale: + x, x_sf = moe_kernel_quantize_input( + x, + gs, + quant_config.quant_dtype, + quant_config.per_act_token_quant, + quant_config.block_shape, + is_fp4_scale_swizzled=False, # delay swizzle to after comm + ) + x = MnnvlMoe.mnnvl_moe_alltoallv( + x, + alltoall_info, + all2all_manager.workspace_tensor, + ep_rank, + ep_size, + ) - x_sf = MnnvlMoe.mnnvl_moe_alltoallv( - x_sf, - alltoall_info, - all2all_manager.workspace_tensor, - ep_rank, - ep_size, - ) - x_sf = nvfp4_block_scale_interleave(x_sf) + x_sf = MnnvlMoe.mnnvl_moe_alltoallv( + x_sf, + alltoall_info, + all2all_manager.workspace_tensor, + ep_rank, + ep_size, + ) + if quant_config.quant_dtype == "nvfp4": + x_sf = nvfp4_block_scale_interleave(x_sf) + else: + # Block-scale path: pass activations through without quantization + x_sf = None + x = MnnvlMoe.mnnvl_moe_alltoallv( + x, + alltoall_info, + all2all_manager.workspace_tensor, + ep_rank, + ep_size, + ) return alltoall_info, topk_ids, topk_weights, x, x_sf @@ -304,6 +348,7 @@ def create_flashinfer_prepare_finalize( use_dp: bool, use_nvfp4: bool = False, enable_alltoallv: bool = False, + use_deepseek_fp8_block_scale: bool = False, ) -> FlashInferCutlassMoEPrepareAndFinalize: """Factory function to create the appropriate FlashInfer implementation.""" if use_nvfp4: @@ -311,5 +356,7 @@ def create_flashinfer_prepare_finalize( return FlashInferAllToAllMoEPrepareAndFinalize(use_dp) else: return FlashInferAllGatherMoEPrepareAndFinalize(use_dp) - # Fp8 only supports AllGather - return FlashInferAllGatherMoEPrepareAndFinalize(use_dp) + # FP8 path currently supported via AllGather; optionally enable block-scale + return FlashInferAllGatherMoEPrepareAndFinalize( + use_dp=use_dp, use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale + ) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index bbd0a4df1048..0479bec33840 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py 
@@ -3,6 +3,7 @@ from collections.abc import Callable from enum import Enum +from functools import partial from typing import TYPE_CHECKING, Any, Optional import torch @@ -122,10 +123,13 @@ def get_fp8_moe_backend(block_quant: bool) -> Fp8MoeBackend: Select the primary FP8 MoE backend Note: Shape-specific fallbacks may still occur at runtime. """ - # prefer FlashInfer backends when available and enabled on supported GPUs + # Prefer FlashInfer backends on supported GPUs; allow SM90 and SM100. if ( current_platform.is_cuda() - and current_platform.is_device_capability(100) + and ( + current_platform.is_device_capability(100) + or current_platform.is_device_capability(90) + ) and envs.VLLM_USE_FLASHINFER_MOE_FP8 and has_flashinfer_moe() ): @@ -134,14 +138,14 @@ def get_fp8_moe_backend(block_quant: bool) -> Fp8MoeBackend: logger.info_once("Using FlashInfer FP8 MoE TRTLLM backend for SM100") return Fp8MoeBackend.FLASHINFER_TRTLLM else: - if block_quant: + if block_quant and current_platform.is_device_capability(100): raise ValueError( "FlashInfer FP8 MoE throughput backend does not " "support block quantization. Please use " "VLLM_FLASHINFER_MOE_BACKEND=latency " "instead." ) - logger.info_once("Using FlashInfer FP8 MoE CUTLASS backend for SM100") + logger.info_once("Using FlashInfer FP8 MoE CUTLASS backend for SM90/SM100") return Fp8MoeBackend.FLASHINFER_CUTLASS # weight-only path for older GPUs without native FP8 @@ -641,6 +645,16 @@ def __init__(self, quant_config: Fp8Config, layer: torch.nn.Module): self.flashinfer_moe_backend = FlashinferMoeBackend.TENSORRT_LLM elif self.fp8_backend == Fp8MoeBackend.FLASHINFER_CUTLASS: self.flashinfer_moe_backend = FlashinferMoeBackend.CUTLASS + if self.block_quant: + assert self.weight_block_size == [128, 128], ( + f"Only support weight_block_size == [128, 128], " + f"got {self.weight_block_size}" + ) + self.flashinfer_moe_fn = partial( + flashinfer_cutlass_moe_fp8, + moe=self.moe, + use_deepseek_fp8_block_scale=self.block_quant, + ) self.allow_deep_gemm = self.fp8_backend == Fp8MoeBackend.DEEPGEMM self.allow_cutlass_block_scaled_grouped_gemm = ( @@ -1012,8 +1026,15 @@ def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None: ): return None elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS: + if self.block_quant: + assert self.weight_block_size == [128, 128], ( + f"Only support weight_block_size == [128, 128], " + f"got {self.weight_block_size}" + ) + # Wire block-scale flag through prepare/finalize when using CUTLASS prepare_finalize = build_flashinfer_fp8_cutlass_moe_prepare_finalize( - self.moe + self.moe, + use_deepseek_fp8_block_scale=self.block_quant, ) logger.debug_once("%s", prepare_finalize.__class__.__name__) return prepare_finalize @@ -1062,9 +1083,11 @@ def select_gemm_impl( ) elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS: + # Select GEMM experts with block-scale when weights are block-quantized experts = select_cutlass_fp8_gemm_impl( self.moe, self.moe_quant_config, + use_deepseek_fp8_block_scale=self.block_quant, ) logger.debug_once("Using %s", experts.__class__.__name__) return experts @@ -1251,16 +1274,17 @@ def apply( workspace=layer.workspace, ) elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS: - assert not self.block_quant - assert not renormalize and custom_routing_function is not None assert activation == "silu", ( f"Expected 'silu' activation but got {activation}" ) - assert scoring_func == "sigmoid", ( - f"Expected 'sigmoid' scoring func but got {scoring_func}" - 
) - - result = flashinfer_cutlass_moe_fp8( + if not self.block_quant: + assert not renormalize and custom_routing_function is not None + assert scoring_func == "sigmoid", ( + f"Expected 'sigmoid' scoring func but got {scoring_func}" + ) + # Delegate to CUTLASS FlashInfer path; function already bound with + # use_deepseek_fp8_block_scale for block-quant when applicable + result = self.flashinfer_moe_fn( x, layer, topk_weights, diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py index e49d374f154d..d9e9b4240271 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py @@ -17,6 +17,7 @@ from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501 create_flashinfer_prepare_finalize, ) +from vllm.platforms import current_platform logger = init_logger(__name__) @@ -190,17 +191,22 @@ def register_moe_scaling_factors(layer: torch.nn.Module) -> None: def build_flashinfer_fp8_cutlass_moe_prepare_finalize( - moe: FusedMoEConfig | None, + moe: FusedMoEConfig | None, use_deepseek_fp8_block_scale: bool = False ) -> mk.FusedMoEPrepareAndFinalize: """Create a FlashInfer CUTLASS fused-MoE prepare finalize kernel""" use_dp = moe.moe_parallel_config.dp_size > 1 if moe is not None else False - return create_flashinfer_prepare_finalize(use_dp) + # Propagate block-scale flag so prepare/finalize can skip act quantization + # and inform the kernel to consume per-block weight scales. + return create_flashinfer_prepare_finalize( + use_dp, use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale + ) def select_cutlass_fp8_gemm_impl( moe: FusedMoEConfig | None, quant_config: FusedMoEQuantConfig, out_dtype: torch.dtype | None = None, + use_deepseek_fp8_block_scale: bool = False, ) -> mk.FusedMoEPermuteExpertsUnpermute: """Return a GEMM *experts* implementation for fused-MoE layers""" @@ -212,12 +218,14 @@ def select_cutlass_fp8_gemm_impl( ep_size=moe.moe_parallel_config.ep_size, tp_rank=moe.moe_parallel_config.tp_rank, tp_size=moe.moe_parallel_config.tp_size, + use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale, ) assert out_dtype is not None, "If moe config is None, out_dtype must be passed" return FlashInferExperts( out_dtype=out_dtype, quant_config=quant_config, + use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale, ) @@ -231,14 +239,22 @@ def flashinfer_cutlass_moe_fp8( global_num_experts: int = -1, expert_map: torch.Tensor | None = None, apply_router_weight_on_input: bool = False, + use_deepseek_fp8_block_scale: bool = False, + moe: FusedMoEConfig | None = None, ) -> torch.Tensor: quant_config = layer.quant_method.get_fused_moe_quant_config(layer) assert quant_config is not None + # Construct modular kernel with block-scale support when requested. 
fused_experts = mk.FusedMoEModularKernel( - build_flashinfer_fp8_cutlass_moe_prepare_finalize(moe=None), + build_flashinfer_fp8_cutlass_moe_prepare_finalize( + moe=moe, use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale + ), select_cutlass_fp8_gemm_impl( - moe=None, quant_config=quant_config, out_dtype=hidden_states.dtype + moe=moe, + quant_config=quant_config, + out_dtype=hidden_states.dtype, + use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale, ), ) @@ -258,7 +274,10 @@ def flashinfer_cutlass_moe_fp8( def get_flashinfer_moe_backend() -> FlashinferMoeBackend: flashinfer_moe_backend = envs.VLLM_FLASHINFER_MOE_BACKEND - if flashinfer_moe_backend == "throughput": + # Prefer CUTLASS on SM90 to cover both SM90/SM100 generations + if flashinfer_moe_backend == "throughput" or current_platform.is_device_capability( + 90 + ): return FlashinferMoeBackend.CUTLASS elif flashinfer_moe_backend == "latency": return FlashinferMoeBackend.TENSORRT_LLM From c934caee88f65258aac00d71d9ae0ecc4a4e1cd7 Mon Sep 17 00:00:00 2001 From: dongbo910220 <32610838+dongbo910220@users.noreply.github.com> Date: Sat, 15 Nov 2025 00:07:20 +0800 Subject: [PATCH 058/578] [Fix] improve aspect ratio in dummy image generation and add common VLM tests for PaddleOCR-VL (#28711) Signed-off-by: dongbo910220 <1275604947@qq.com> --- .../multimodal/generation/test_common.py | 18 ++++++++++++++++++ vllm/model_executor/models/paddleocr_vl.py | 3 +-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index 22083d9f1614..95b64b380db0 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -12,6 +12,7 @@ from packaging.version import Version from transformers import ( AutoModel, + AutoModelForCausalLM, AutoModelForImageTextToText, AutoModelForTextToWaveform, ) @@ -691,6 +692,23 @@ patch_hf_runner=model_utils.ovis2_5_patch_hf_runner, hf_model_kwargs={"revision": "refs/pr/5"}, ), + "paddleocr_vl": VLMTestInfo( + models=["PaddlePaddle/PaddleOCR-VL"], + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", + img_idx_to_prompt=lambda idx: ( + "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>" + ), + multi_image_prompt=( + "Image-1: <|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>\n" + "Image-2: <|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>\n" + "Describe these two images separately." + ), + max_model_len=8192, + max_num_seqs=2, + auto_cls=AutoModelForCausalLM, + image_size_factors=[(), (0.25,)], + ), "phi3v": VLMTestInfo( models=["microsoft/Phi-3.5-vision-instruct"], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py index 183f458658aa..3ef6470070d1 100644 --- a/vllm/model_executor/models/paddleocr_vl.py +++ b/vllm/model_executor/models/paddleocr_vl.py @@ -232,8 +232,7 @@ def get_image_size_with_most_features(self) -> ImageSize: # Find factors of max_num_tokens close to its square root # to create a dummy image with a reasonable aspect ratio. 
h_patches = int(math.sqrt(max_num_tokens)) - while max_num_tokens % h_patches != 0: - h_patches -= 1 + max_num_tokens -= max_num_tokens % h_patches w_patches = max_num_tokens // h_patches return ImageSize(height=h_patches * factor, width=w_patches * factor) From 5f3cd7f7f20a8e4445d70cbd1f5475175ef391e3 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 14 Nov 2025 16:34:14 +0000 Subject: [PATCH 059/578] [Docs] Update the name of `Transformers backend` -> `Transformers modeling backend` (#28725) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .github/CODEOWNERS | 2 +- docs/contributing/model/README.md | 2 +- .../frameworks/hf_inference_endpoints.md | 4 +-- docs/models/supported_models.md | 26 +++++++++---------- tests/models/test_transformers.py | 4 +-- vllm/config/model.py | 8 +++--- vllm/lora/layers/base_linear.py | 2 +- vllm/model_executor/models/adapters.py | 4 +-- .../models/transformers/__init__.py | 4 +-- .../models/transformers/base.py | 9 ++++--- .../models/transformers/causal.py | 2 +- .../models/transformers/legacy.py | 2 +- .../model_executor/models/transformers/moe.py | 4 +-- .../models/transformers/multimodal.py | 12 +++++---- .../models/transformers/pooling.py | 2 +- .../models/transformers/utils.py | 2 +- 16 files changed, 46 insertions(+), 43 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index bfb0e91fd06e..6e178bb690c5 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -57,7 +57,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /tests/v1/kv_connector @ApostaC /tests/v1/offloading @ApostaC -# Transformers backend +# Transformers modeling backend /vllm/model_executor/models/transformers @hmellor /tests/models/test_transformers.py @hmellor diff --git a/docs/contributing/model/README.md b/docs/contributing/model/README.md index d8c40c519573..13f3edb7e1af 100644 --- a/docs/contributing/model/README.md +++ b/docs/contributing/model/README.md @@ -1,7 +1,7 @@ # Summary !!! important - Many decoder language models can now be automatically loaded using the [Transformers backend](../../models/supported_models.md#transformers) without having to implement them in vLLM. See if `vllm serve ` works first! + Many decoder language models can now be automatically loaded using the [Transformers modeling backend](../../models/supported_models.md#transformers) without having to implement them in vLLM. See if `vllm serve ` works first! vLLM models are specialized [PyTorch](https://pytorch.org/) models that take advantage of various [features](../../features/README.md#compatibility-matrix) to optimize their performance. diff --git a/docs/deployment/frameworks/hf_inference_endpoints.md b/docs/deployment/frameworks/hf_inference_endpoints.md index d39bb9a899c8..05df0dacd8f1 100644 --- a/docs/deployment/frameworks/hf_inference_endpoints.md +++ b/docs/deployment/frameworks/hf_inference_endpoints.md @@ -156,7 +156,7 @@ In this guide, we demonstrate manual deployment using the [`rednote-hilab/dots.o ## Advanced Deployment Details -With the [transformers backend integration](https://blog.vllm.ai/2025/04/11/transformers-backend.html), vLLM now offers Day 0 support for any model compatible with `transformers`. This means you can deploy such models immediately, leveraging vLLM’s optimized inference without additional backend modifications. 
+With the [Transformers modeling backend integration](https://blog.vllm.ai/2025/04/11/transformers-backend.html), vLLM now offers Day 0 support for any model compatible with `transformers`. This means you can deploy such models immediately, leveraging vLLM’s optimized inference without additional backend modifications. Hugging Face Inference Endpoints provides a fully managed environment for serving models via vLLM. You can deploy models without configuring servers, installing dependencies, or managing clusters. Endpoints also support deployment across multiple cloud providers (AWS, Azure, GCP) without the need for separate accounts. @@ -167,4 +167,4 @@ The platform integrates seamlessly with the Hugging Face Hub, allowing you to de - Explore the [Inference Endpoints](https://endpoints.huggingface.co/catalog) model catalog - Read the Inference Endpoints [documentation](https://huggingface.co/docs/inference-endpoints/en/index) - Learn about [Inference Endpoints engines](https://huggingface.co/docs/inference-endpoints/en/engines/vllm) -- Understand the [transformers backend integration](https://blog.vllm.ai/2025/04/11/transformers-backend.html) +- Understand the [Transformers modeling backend integration](https://blog.vllm.ai/2025/04/11/transformers-backend.html) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index c1eb207efcd1..0439e9cf2364 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -15,9 +15,9 @@ These models are what we list in [supported text models](#list-of-text-only-lang ### Transformers -vLLM also supports model implementations that are available in Transformers. You should expect the performance of a Transformers model implementation used in vLLM to be within <5% of the performance of a dedicated vLLM model implementation. We call this feature the "Transformers backend". +vLLM also supports model implementations that are available in Transformers. You should expect the performance of a Transformers model implementation used in vLLM to be within <5% of the performance of a dedicated vLLM model implementation. We call this feature the "Transformers modeling backend". -Currently, the Transformers backend works for the following: +Currently, the Transformers modeling backend works for the following: - Modalities: embedding models, language models and vision-language models* - Architectures: encoder-only, decoder-only, mixture-of-experts @@ -25,7 +25,7 @@ Currently, the Transformers backend works for the following: _*Vision-language models currently accept only image inputs. Support for video inputs will be added in a future release._ -If the Transformers model implementation follows all the steps in [writing a custom model](#writing-custom-models) then, when used with the Transformers backend, it will be compatible with the following features of vLLM: +If the Transformers model implementation follows all the steps in [writing a custom model](#writing-custom-models) then, when used with the Transformers modeling backend, it will be compatible with the following features of vLLM: - All the features listed in the [compatibility matrix](../features/README.md#feature-x-feature) - Any combination of the following vLLM parallelisation schemes: @@ -44,7 +44,7 @@ llm.apply_model(lambda model: print(type(model))) If the printed type starts with `Transformers...` then it's using the Transformers model implementation! 
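A minimal offline-inference sketch of this check, combined with the `model_impl` override described in the next paragraph (the model name is only an example):

```python
from vllm import LLM

# Prefer the Transformers modeling backend even if vLLM has a native class.
llm = LLM(model="Qwen/Qwen3-0.6B", model_impl="transformers")

# A printed class name starting with "Transformers" confirms which
# implementation was actually instantiated.
llm.apply_model(lambda model: print(type(model)))
```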
-If a model has a vLLM implementation but you would prefer to use the Transformers implementation via the Transformers backend, set `model_impl="transformers"` for [offline inference](../serving/offline_inference.md) or `--model-impl transformers` for the [online serving](../serving/openai_compatible_server.md). +If a model has a vLLM implementation but you would prefer to use the Transformers implementation via the Transformers modeling backend, set `model_impl="transformers"` for [offline inference](../serving/offline_inference.md) or `--model-impl transformers` for the [online serving](../serving/openai_compatible_server.md). !!! note For vision-language models, if you are loading with `dtype="auto"`, vLLM loads the whole model with config's `dtype` if it exists. In contrast the native Transformers will respect the `dtype` attribute of each backbone in the model. That might cause a slight difference in performance. @@ -53,12 +53,12 @@ If a model has a vLLM implementation but you would prefer to use the Transformer If a model is neither supported natively by vLLM nor Transformers, it can still be used in vLLM! -For a model to be compatible with the Transformers backend for vLLM it must: +For a model to be compatible with the Transformers modeling backend for vLLM it must: - be a Transformers compatible custom model (see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)): - The model directory must have the correct structure (e.g. `config.json` is present). - `config.json` must contain `auto_map.AutoModel`. -- be a Transformers backend for vLLM compatible model (see [Writing custom models](#writing-custom-models)): +- be a Transformers modeling backend for vLLM compatible model (see [Writing custom models](#writing-custom-models)): - Customisation should be done in the base model (e.g. in `MyModel`, not `MyModelForCausalLM`). If the compatible model is: @@ -66,13 +66,13 @@ If the compatible model is: - on the Hugging Face Model Hub, simply set `trust_remote_code=True` for [offline-inference](../serving/offline_inference.md) or `--trust-remote-code` for the [openai-compatible-server](../serving/openai_compatible_server.md). - in a local directory, simply pass directory path to `model=` for [offline-inference](../serving/offline_inference.md) or `vllm serve ` for the [openai-compatible-server](../serving/openai_compatible_server.md). -This means that, with the Transformers backend for vLLM, new models can be used before they are officially supported in Transformers or vLLM! +This means that, with the Transformers modeling backend for vLLM, new models can be used before they are officially supported in Transformers or vLLM! #### Writing custom models -This section details the necessary modifications to make to a Transformers compatible custom model that make it compatible with the Transformers backend for vLLM. (We assume that a Transformers compatible custom model has already been created, see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)). +This section details the necessary modifications to make to a Transformers compatible custom model that make it compatible with the Transformers modeling backend for vLLM. (We assume that a Transformers compatible custom model has already been created, see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)). 
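Before diving into those modifications, a hedged sketch of the two loading paths listed earlier (Hub with remote code, or a local directory); the model name here is hypothetical:

```python
from vllm import LLM

# Hypothetical custom model on the Hub whose config.json sets auto_map.AutoModel;
# trust_remote_code lets vLLM pull in the Transformers-compatible implementation.
llm = LLM(model="my-org/my-custom-model", trust_remote_code=True)

# A local checkout works the same way: LLM(model="/path/to/my-custom-model", ...)
print(llm.generate("Hello, my name is")[0].outputs[0].text)
```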
-To make your model compatible with the Transformers backend, it needs: +To make your model compatible with the Transformers modeling backend, it needs: 1. `kwargs` passed down through all modules from `MyModel` to `MyAttention`. - If your model is encoder-only: @@ -134,7 +134,7 @@ Here is what happens in the background when this model is loaded: 1. The config is loaded. 2. `MyModel` Python class is loaded from the `auto_map` in config, and we check that the model `is_backend_compatible()`. -3. `MyModel` is loaded into one of the Transformers backend classes in [vllm/model_executor/models/transformers](../../vllm/model_executor/models/transformers) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used. +3. `MyModel` is loaded into one of the Transformers modeling backend classes in [vllm/model_executor/models/transformers](../../vllm/model_executor/models/transformers) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used. That's it! @@ -182,7 +182,7 @@ To determine whether a given model is natively supported, you can check the `con If the `"architectures"` field contains a model architecture listed below, then it should be natively supported. Models do not _need_ to be natively supported to be used in vLLM. -The [Transformers backend](#transformers) enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!). +The [Transformers modeling backend](#transformers) enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!). !!! tip The easiest way to check if your model is really supported at runtime is to run the program below: @@ -451,7 +451,7 @@ th { | `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | | | `LongcatFlashForCausalLM` | LongCat-Flash | `meituan-longcat/LongCat-Flash-Chat`, `meituan-longcat/LongCat-Flash-Chat-FP8` | ✅︎ | ✅︎ | -Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it! +Some models are supported only via the [Transformers modeling backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers modeling backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it! 
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | |--------------|--------|-------------------|----------------------|---------------------------| @@ -720,7 +720,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `TarsierForConditionalGeneration` | Tarsier | T + IE+ | `omni-search/Tarsier-7b`, `omni-search/Tarsier-34b` | | ✅︎ | | `Tarsier2ForConditionalGeneration`^ | Tarsier2 | T + IE+ + VE+ | `omni-research/Tarsier2-Recap-7b`, `omni-research/Tarsier2-7b-0115` | | ✅︎ | -Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it! +Some models are supported only via the [Transformers modeling backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers modeling backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it! | Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | |--------------|--------|--------|-------------------|-----------------------------|-----------------------------------------| diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py index a18f5b607763..ae5befd2c00b 100644 --- a/tests/models/test_transformers.py +++ b/tests/models/test_transformers.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Test the functionality of the Transformers backend.""" +"""Test the functionality of the Transformers modeling backend.""" from typing import Any @@ -85,7 +85,7 @@ def test_models( required = Version("5.0.0.dev") if model == "allenai/OLMoE-1B-7B-0924" and installed < required: pytest.skip( - "MoE models with the Transformers backend require " + "MoE models with the Transformers modeling backend require " f"transformers>={required}, but got {installed}" ) diff --git a/vllm/config/model.py b/vllm/config/model.py index 8ec66b6b3160..b3a28af6de38 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -732,7 +732,7 @@ def validate_model_config_after(self: "ModelConfig") -> "ModelConfig": return self def _get_transformers_backend_cls(self) -> str: - """Determine which Transformers backend class will be used if + """Determine which Transformers modeling backend class will be used if `model_impl` is set to `transformers` or `auto`.""" cls = "Transformers" # If 'hf_config != hf_text_config' it's a nested config, i.e. multimodal @@ -746,8 +746,8 @@ def _get_transformers_backend_cls(self) -> str: # User specified value take precedence if self.runner != "auto": runner = self.runner - # Only consider Transformers backend pooling classes if we're wrapping an - # architecture that defaults to pooling. 
Otherwise, we return the LM class + # Only consider Transformers modeling backend pooling classes if we're wrapping + # an architecture that defaults to pooling. Otherwise, we return the LM class # and use adapters. if runner == "pooling" and task in {"embed", "classify"}: if task == "embed": @@ -759,7 +759,7 @@ def _get_transformers_backend_cls(self) -> str: return cls def using_transformers_backend(self) -> bool: - """Check if the model is using the Transformers backend class.""" + """Check if the model is using the Transformers modeling backend class.""" used_cls = self._model_info.architecture transformers_backend_cls = self._get_transformers_backend_cls() return used_cls == transformers_backend_cls diff --git a/vllm/lora/layers/base_linear.py b/vllm/lora/layers/base_linear.py index d619a0edc124..3db4165e2017 100644 --- a/vllm/lora/layers/base_linear.py +++ b/vllm/lora/layers/base_linear.py @@ -121,7 +121,7 @@ def set_lora( def apply(self, x: torch.Tensor, bias: torch.Tensor | None = None) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - # In transformers backend, x and output have extra batch dimension like + # In Transformers modeling backend, x and output have extra batch dimension like # (1, seq_len, hidden_dim), while punica expects (seq_len, hidden_dim), # therefore we need to flatten the batch dimensions. if x.ndim == 3 and output.ndim == 3: diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index f742090df71f..a9cc49451a1d 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -429,7 +429,7 @@ def load_weights_using_from_2_way_softmax( if text_config.tie_word_embeddings: # embed_tokens is the assumed name for input embeddings. If the model does not # have this attribute, we fallback to get_input_embeddings(), which is used by - # the Transformers backend. + # the Transformers modeling backend. embed_tokens = ( model.model.embed_tokens if hasattr(model.model, "embed_tokens") @@ -487,7 +487,7 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te if text_config.tie_word_embeddings: # embed_tokens is the assumed name for input embeddings. If the model does not # have this attribute, we fallback to get_input_embeddings(), which is used by - # the Transformers backend. + # the Transformers modeling backend. embed_tokens = ( model.model.embed_tokens if hasattr(model.model, "embed_tokens") diff --git a/vllm/model_executor/models/transformers/__init__.py b/vllm/model_executor/models/transformers/__init__.py index 365b5eb08893..93cd8ff50766 100644 --- a/vllm/model_executor/models/transformers/__init__.py +++ b/vllm/model_executor/models/transformers/__init__.py @@ -120,8 +120,8 @@ def __getattr__(name: str): """Handle imports of non-existent classes with a helpful error message.""" if name not in globals(): raise AttributeError( - "The Transformers backend does not currently have a class to handle " - f"the requested model type: {name}. Please open an issue at " + "The Transformers modeling backend does not currently have a class to " + f"handle the requested model type: {name}. 
Please open an issue at " "https://github.com/vllm-project/vllm/issues/new" ) return globals()[name] diff --git a/vllm/model_executor/models/transformers/base.py b/vllm/model_executor/models/transformers/base.py index 63096e57f8ee..f4ba4758bcc4 100644 --- a/vllm/model_executor/models/transformers/base.py +++ b/vllm/model_executor/models/transformers/base.py @@ -14,7 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Transformers backend base class.""" +"""Transformers modeling backend base class.""" from collections.abc import Iterable from typing import TYPE_CHECKING @@ -118,7 +118,7 @@ def __init_subclass__(cls, *args, **kwargs): def __init__(self, *, vllm_config: "VllmConfig", prefix: str = ""): super().__init__() - logger.info("Using Transformers backend.") + logger.info("Using Transformers modeling backend.") self.config = vllm_config.model_config.hf_config self.text_config = self.config.get_text_config() @@ -147,7 +147,8 @@ def __init__(self, *, vllm_config: "VllmConfig", prefix: str = ""): # Check for unsupported quantization methods. if quant_method_name == "mxfp4": raise NotImplementedError( - "Transformers backend does not support MXFP4 quantization yet." + "Transformers modeling backend does " + "not support MXFP4 quantization yet." ) # Skip loading extra bias for GPTQ models. if "gptq" in quant_method_name: @@ -458,6 +459,6 @@ def check_version(min_version: str, feature: str): required = Version(min_version) if installed < required: raise ImportError( - f"Transformers backend requires transformers>={required} " + f"Transformers modeling backend requires transformers>={required} " f"for {feature}, but got {installed}" ) diff --git a/vllm/model_executor/models/transformers/causal.py b/vllm/model_executor/models/transformers/causal.py index 42fd11117c73..b2865ed0c7ff 100644 --- a/vllm/model_executor/models/transformers/causal.py +++ b/vllm/model_executor/models/transformers/causal.py @@ -14,7 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Transformers backend mixin for causal language models.""" +"""Transformers modeling backend mixin for causal language models.""" from typing import TYPE_CHECKING diff --git a/vllm/model_executor/models/transformers/legacy.py b/vllm/model_executor/models/transformers/legacy.py index a453870a2687..aca630be5615 100644 --- a/vllm/model_executor/models/transformers/legacy.py +++ b/vllm/model_executor/models/transformers/legacy.py @@ -14,7 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Transformers backend mixin for legacy models.""" +"""Transformers modeling backend mixin for legacy models.""" from typing import TYPE_CHECKING diff --git a/vllm/model_executor/models/transformers/moe.py b/vllm/model_executor/models/transformers/moe.py index 8e39eb0b9902..4973014c3d4e 100644 --- a/vllm/model_executor/models/transformers/moe.py +++ b/vllm/model_executor/models/transformers/moe.py @@ -14,7 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Transformers backend mixin for Mixture of Experts (MoE) models.""" +"""Transformers modeling backend mixin for Mixture of Experts (MoE) models.""" from typing import TYPE_CHECKING, Any @@ -39,7 +39,7 @@ @CustomOp.register("transformers_fused_moe") class TransformersFusedMoE(FusedMoE): - """Custom FusedMoE for the Transformers backend.""" + """Custom FusedMoE for the Transformers modeling backend.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) diff --git a/vllm/model_executor/models/transformers/multimodal.py b/vllm/model_executor/models/transformers/multimodal.py index 9b0463f41fa8..ccf605371987 100644 --- a/vllm/model_executor/models/transformers/multimodal.py +++ b/vllm/model_executor/models/transformers/multimodal.py @@ -14,7 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Transformers backend mixin for multi-modal models.""" +"""Transformers modeling backend mixin for multi-modal models.""" from collections.abc import Mapping from typing import TYPE_CHECKING @@ -310,9 +310,9 @@ def forward( return model_output def get_language_model(self) -> torch.nn.Module: - """Transformers backend multimodal classes do not contain a separate vLLM - language model class. Therefore, in order to return a language model vLLM class, - we use a wrapper to give `self` the same interface as a text model.""" + """Transformers modeling backend multimodal classes do not contain a separate + vLLM language model class. Therefore, in order to return a language model vLLM + class, we use a wrapper to give `self` the same interface as a text model.""" # Exclude self and object bases = self.__class__.mro()[1:-1] @@ -385,7 +385,9 @@ def get_mrope_input_positions( for k, v in kwargs.items() if k not in {"image_grid_thw", "video_grid_thw"} ): - raise NotImplementedError("Transformers backend only supports images.") + raise NotImplementedError( + "Transformers modeling backend only supports images." + ) image_grid_thw = kwargs.get("image_grid_thw", []) video_grid_thw = kwargs.get("video_grid_thw", []) diff --git a/vllm/model_executor/models/transformers/pooling.py b/vllm/model_executor/models/transformers/pooling.py index 8117bbac013e..4c2a74bccb6a 100644 --- a/vllm/model_executor/models/transformers/pooling.py +++ b/vllm/model_executor/models/transformers/pooling.py @@ -14,7 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Transformers backend mixins for pooling models.""" +"""Transformers modeling backend mixins for pooling models.""" from typing import TYPE_CHECKING diff --git a/vllm/model_executor/models/transformers/utils.py b/vllm/model_executor/models/transformers/utils.py index 267a6e06e6bb..517eb54d53ac 100644 --- a/vllm/model_executor/models/transformers/utils.py +++ b/vllm/model_executor/models/transformers/utils.py @@ -14,7 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Transformers backend utilities.""" +"""Transformers modeling backend utilities.""" from contextlib import contextmanager from pathlib import Path From d54a18a47e7cb6a126a022914c7965f84e15217c Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 14 Nov 2025 11:37:18 -0500 Subject: [PATCH 060/578] [CI][CPU] Smoke test for Apple Silicon using GHA MacOS runner (#28688) Signed-off-by: mgoin --- .github/workflows/macos-smoke-test.yml | 73 ++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 .github/workflows/macos-smoke-test.yml diff --git a/.github/workflows/macos-smoke-test.yml b/.github/workflows/macos-smoke-test.yml new file mode 100644 index 000000000000..f56fdc0dbe79 --- /dev/null +++ b/.github/workflows/macos-smoke-test.yml @@ -0,0 +1,73 @@ +name: macOS Apple Silicon Smoke Test + +on: + workflow_dispatch: # Manual trigger + +jobs: + macos-m1-smoke-test: + runs-on: macos-latest + timeout-minutes: 20 + + steps: + - uses: actions/checkout@v4 + + - uses: astral-sh/setup-uv@v4 + with: + enable-cache: true + python-version: '3.12' + + - name: Install dependencies + run: | + uv pip install -r requirements/cpu-build.txt + uv pip install -r requirements/cpu.txt + + - name: Build vLLM + run: uv pip install -v -e . + env: + CMAKE_BUILD_PARALLEL_LEVEL: 4 + + - name: Verify installation + run: | + python -c "import vllm; print(f'vLLM version: {vllm.__version__}')" + python -c "import torch; print(f'PyTorch: {torch.__version__}')" + + - name: Smoke test vllm serve + timeout-minutes: 10 + run: | + # Start server in background + vllm serve Qwen/Qwen3-0.6B \ + --max-model-len=2048 \ + --load-format=dummy \ + --enforce-eager \ + --port 8000 & + + SERVER_PID=$! + + # Wait for server to start + for i in {1..30}; do + if curl -s http://localhost:8000/health > /dev/null; then + echo "Server started successfully" + break + fi + if [ "$i" -eq 30 ]; then + echo "Server failed to start" + kill "$SERVER_PID" + exit 1 + fi + sleep 2 + done + + # Test health endpoint + curl -f http://localhost:8000/health + + # Test completion + curl -f http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen3-0.6B", + "prompt": "Hello", + "max_tokens": 5 + }' + + # Cleanup + kill "$SERVER_PID" From 6f1e7f7226447f606a0731376a2d0bd080aa2767 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Fri, 14 Nov 2025 17:58:01 +0100 Subject: [PATCH 061/578] [DisaggEverything] Tokens in<>out `/generate` endpoint (#24261) Signed-off-by: NickLucche Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .../online_serving/token_generation_client.py | 49 ++++ requirements/docs.txt | 4 + .../entrypoints/openai/test_serving_tokens.py | 262 +++++++++++++++++ vllm/engine/arg_utils.py | 5 + vllm/entrypoints/openai/api_server.py | 81 ++++++ vllm/entrypoints/openai/cli_args.py | 5 + vllm/entrypoints/openai/protocol.py | 77 +++++ vllm/entrypoints/openai/serving_engine.py | 4 + vllm/entrypoints/openai/serving_tokens.py | 269 ++++++++++++++++++ vllm/sampling_params.py | 2 + vllm/v1/engine/__init__.py | 8 +- vllm/v1/serial_utils.py | 65 ++++- 12 files changed, 822 insertions(+), 9 deletions(-) create mode 100644 examples/online_serving/token_generation_client.py create mode 100644 tests/entrypoints/openai/test_serving_tokens.py create mode 100644 vllm/entrypoints/openai/serving_tokens.py diff --git a/examples/online_serving/token_generation_client.py 
b/examples/online_serving/token_generation_client.py new file mode 100644 index 000000000000..88ee43c5d9cd --- /dev/null +++ b/examples/online_serving/token_generation_client.py @@ -0,0 +1,49 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import httpx +from transformers import AutoTokenizer + +GEN_ENDPOINT = "http://localhost:8000/inference/v1/generate" +DUMMY_API_KEY = "empty" +MODEL_NAME = "Qwen/Qwen3-0.6B" + +transport = httpx.HTTPTransport() +headers = {"Authorization": f"Bearer {DUMMY_API_KEY}"} +client = httpx.Client( + transport=transport, + base_url=GEN_ENDPOINT, + timeout=600, + headers=headers, +) +messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "How many countries are in the EU?"}, +] + + +def main(client): + tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) + token_ids = tokenizer.apply_chat_template( + messages, + add_generation_prompt=True, + enable_thinking=False, + ) + payload = { + "model": MODEL_NAME, + "token_ids": token_ids, + "sampling_params": {"max_tokens": 24, "temperature": 0.2, "detokenize": False}, + "stream": False, + } + resp = client.post(GEN_ENDPOINT, json=payload) + resp.raise_for_status() + data = resp.json() + print(data) + print("-" * 50) + print("Token generation results:") + res = tokenizer.decode(data["choices"][0]["token_ids"]) + print(res) + print("-" * 50) + + +if __name__ == "__main__": + main(client) diff --git a/requirements/docs.txt b/requirements/docs.txt index 0fd6dbe22c51..32e004b2b64b 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -10,3 +10,7 @@ mkdocs-minify-plugin regex ruff pydantic + +# For generating argparse docs. +# Adding requirements here should only be used as a last resort. 
+msgspec # Need for multiple inheritance involving msgspec.Struct \ No newline at end of file diff --git a/tests/entrypoints/openai/test_serving_tokens.py b/tests/entrypoints/openai/test_serving_tokens.py new file mode 100644 index 000000000000..62d843e35b86 --- /dev/null +++ b/tests/entrypoints/openai/test_serving_tokens.py @@ -0,0 +1,262 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import httpx +import pytest +import pytest_asyncio +from transformers import AutoTokenizer + +from vllm.config import ModelConfig +from vllm.v1.engine.detokenizer import check_stop_strings + +from ...utils import RemoteOpenAIServer + +MODEL_NAME = "Qwen/Qwen3-0.6B" +GEN_ENDPOINT = "/inference/v1/generate" + + +def get_vocab_size(model_name): + config = ModelConfig( + model=model_name, + seed=0, + dtype="bfloat16", + ) + return config.get_vocab_size() + + +@pytest.fixture(scope="module") +def tokenizer(): + return AutoTokenizer.from_pretrained(MODEL_NAME) + + +@pytest.fixture(scope="module") +def messages(): + return [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "How many countries are in the EU?"}, + ] + + +@pytest.fixture(scope="module") +def server(request): + args = [ + "--dtype", + "bfloat16", + "--max-model-len", + "1024", + "--enforce-eager", + ] + + extra_args = getattr(request, "param", None) + if extra_args is not None: + args = args + ( + list(extra_args) + if isinstance(extra_args, (list, tuple)) + else [str(extra_args)] + ) + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server: RemoteOpenAIServer): + transport = httpx.AsyncHTTPTransport(uds=server.uds) if server.uds else None + headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"} + async with httpx.AsyncClient( + transport=transport, + base_url=server.url_root, + timeout=600, + headers=headers, + ) as c: + yield c + + +@pytest.mark.asyncio +async def test_generate_endpoint(client): + payload = { + "model": MODEL_NAME, + "token_ids": [1, 2, 3], + "sampling_params": {"max_tokens": 5}, + "stream": False, + } + resp = await client.post(GEN_ENDPOINT, json=payload) + resp.raise_for_status() + data = resp.json() + assert "choices" in data + + +@pytest.mark.asyncio +async def test_same_response_as_chat_completions(client, tokenizer, messages): + token_ids = tokenizer.apply_chat_template( + messages, + add_generation_prompt=True, + enable_thinking=False, # default with Qwen3 + ) + for ignore_eos in [True, False]: + payload = { + "model": MODEL_NAME, + "token_ids": token_ids, + "sampling_params": { + "max_tokens": 24, + "temperature": 0.0, + # NOTE coordinator will set this to skip detokenization + "detokenize": False, + "ignore_eos": ignore_eos, + }, + "stream": False, + } + generate_resp = await client.post(GEN_ENDPOINT, json=payload) + generate_data = generate_resp.json() + generate_res = tokenizer.decode( + generate_data["choices"][0]["token_ids"], skip_special_tokens=True + ) + + payload = { + "model": MODEL_NAME, + "messages": messages, + "max_tokens": 24, + "temperature": 0.0, + "stream": False, + "ignore_eos": ignore_eos, + "chat_template_kwargs": dict(enable_thinking=False), + } + completions_resp = await client.post("/v1/chat/completions", json=payload) + completions_data = completions_resp.json() + completions_res = completions_data["choices"][0]["message"]["content"] + + assert generate_res == completions_res + + +@pytest.mark.asyncio 
+async def test_stop_string_workflow(client, tokenizer, messages): + token_ids = tokenizer.apply_chat_template( + messages, + add_generation_prompt=True, + enable_thinking=False, # default with Qwen3 + ) + payload = { + "model": MODEL_NAME, + "token_ids": token_ids, + "sampling_params": { + "max_tokens": 24, + "temperature": 0.0, + "detokenize": False, + # stop strings are only supported when detokenize is True. + "stop": ["27 member"], + }, + # TODO stream test is much more interesting + "stream": False, + } + with pytest.raises(httpx.HTTPStatusError): + generate_resp = await client.post(GEN_ENDPOINT, json=payload) + generate_resp.raise_for_status() + + payload["sampling_params"]["stop"] = None + generate_resp = await client.post( + GEN_ENDPOINT, json=payload, headers={"X-Request-Id": "42"} + ) + generate_data = generate_resp.json() + generate_res = tokenizer.decode( + generate_data["choices"][0]["token_ids"], skip_special_tokens=True + ) + + # NOTE This is under the responsibility of the coordinator + # stop_checker = StopChecker( + # max_model_len=1024, get_tokenizer_for_seq=lambda _: tokenizer + # ) + stop_str, truncate_to = check_stop_strings( + generate_res, len(generate_res), ["27 member"], False + ) + assert stop_str == "27 member" + # abort request that hit stop string (requires tokens-only mode) + # res = await client.post("/abort_requests", json={"request_ids": ["generate-tokens-42"]}) # noqa: E501 + # res.raise_for_status() + generate_res = generate_res[:truncate_to] + + # Get stop_str response from chat completions + payload = { + "model": MODEL_NAME, + "messages": messages, + "max_tokens": 24, + "temperature": 0.0, + "stream": False, + "stop": ["27 member"], + "chat_template_kwargs": dict(enable_thinking=False), + } + completions_resp = await client.post("/v1/chat/completions", json=payload) + completions_data = completions_resp.json() + completions_res = completions_data["choices"][0]["message"]["content"] + assert generate_res == completions_res + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "server", + [ + [ + "--enable-lora", + "--lora-modules", + "Alice=charent/self_cognition_Alice", + "Bob=charent/self_cognition_Bob", + "--max-lora-rank", + "64", + "--max-cpu-loras", + "2", + ] + ], + indirect=True, +) +async def test_generate_with_lora_adapter(client, tokenizer, messages): + # Verify adapters are listed + models_resp = await client.get("/v1/models") + models_resp.raise_for_status() + models = {m["id"] for m in models_resp.json().get("data", [])} + assert {"Alice", "Bob"}.issubset(models) + + # Generate using a LoRA adapter by specifying its name as the model + payload = { + "model": "Alice", + "token_ids": [1, 2, 3], + "sampling_params": {"max_tokens": 5}, + "stream": False, + } + resp = await client.post(GEN_ENDPOINT, json=payload) + resp.raise_for_status() + data = resp.json() + assert "choices" in data + + token_ids = tokenizer.apply_chat_template( + messages, + add_generation_prompt=True, + enable_thinking=False, # default with Qwen3 + ) + payload = { + "model": "Alice", + "token_ids": token_ids, + "sampling_params": { + "max_tokens": 24, + "temperature": 0.0, + "detokenize": False, + }, + "stream": False, + } + generate_resp = await client.post(GEN_ENDPOINT, json=payload) + generate_data = generate_resp.json() + generate_res = tokenizer.decode( + generate_data["choices"][0]["token_ids"], skip_special_tokens=True + ) + + payload = { + "model": "Alice", + "messages": messages, + "max_tokens": 24, + "temperature": 0.0, + "stream": False, + 
"chat_template_kwargs": dict(enable_thinking=False), + } + completions_resp = await client.post("/v1/chat/completions", json=payload) + completions_data = completions_resp.json() + completions_res = completions_data["choices"][0]["message"]["content"] + + assert generate_res == completions_res diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index cacebc530b6e..999ed780c20b 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -566,6 +566,7 @@ class EngineArgs: kv_offloading_backend: KVOffloadingBackend | None = ( CacheConfig.kv_offloading_backend ) + tokens_only: bool = False def __post_init__(self): # support `EngineArgs(compilation_config={...})` @@ -1495,6 +1496,10 @@ def create_engine_config( else ParallelConfig.data_parallel_rpc_port ) + if self.tokens_only and not model_config.skip_tokenizer_init: + model_config.skip_tokenizer_init = True + logger.info("Skipping tokenizer initialization for tokens-only mode.") + # Forward the deprecated CLI args to the EPLB config. if self.num_redundant_experts is not None: self.eplb_config.num_redundant_experts = self.num_redundant_experts diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index f30c6ef2cd0a..3e59af717d95 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -65,6 +65,8 @@ EmbeddingResponse, ErrorInfo, ErrorResponse, + GenerateRequest, + GenerateResponse, IOProcessorResponse, PoolingBytesResponse, PoolingRequest, @@ -96,6 +98,7 @@ from vllm.entrypoints.openai.serving_responses import OpenAIServingResponses from vllm.entrypoints.openai.serving_score import ServingScores from vllm.entrypoints.openai.serving_tokenization import OpenAIServingTokenization +from vllm.entrypoints.openai.serving_tokens import ServingTokens from vllm.entrypoints.openai.serving_transcription import ( OpenAIServingTranscription, OpenAIServingTranslation, @@ -357,6 +360,10 @@ def engine_client(request: Request) -> EngineClient: return request.app.state.engine_client +def generate_tokens(request: Request) -> ServingTokens | None: + return request.app.state.serving_tokens + + @router.get("/health", response_class=Response) async def health(raw_request: Request) -> Response: """Health check.""" @@ -1228,6 +1235,41 @@ async def is_scaling_elastic_ep(raw_request: Request): ] +@router.post( + "/inference/v1/generate", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.OK.value: {"content": {"text/event-stream": {}}}, + HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, + HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse}, + HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, + }, +) +@with_cancellation +@load_aware_call +async def generate(request: GenerateRequest, raw_request: Request): + handler = generate_tokens(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support generate tokens API" + ) + try: + generator = await handler.serve_tokens(request, raw_request) + except Exception as e: + raise HTTPException( + status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e) + ) from e + if isinstance(generator, ErrorResponse): + return JSONResponse( + content=generator.model_dump(), status_code=generator.error.code + ) + + elif isinstance(generator, GenerateResponse): + return JSONResponse(content=generator.model_dump()) + + return StreamingResponse(content=generator, media_type="text/event-stream") + + if envs.VLLM_TORCH_PROFILER_DIR: 
logger.warning_once( "Torch Profiler is enabled in the API server. This should ONLY be " @@ -1629,6 +1671,31 @@ async def log_response(request: Request, call_next): ) app = sagemaker_standards.bootstrap(app) + # Optional endpoints + if args.tokens_only: + + @app.post("/abort_requests") + async def abort_requests(raw_request: Request): + """ + Abort one or more requests. To be used in a + Disaggregated Everything setup. + """ + try: + body = await raw_request.json() + except json.JSONDecodeError as e: + raise HTTPException( + status_code=HTTPStatus.BAD_REQUEST.value, + detail=f"JSON decode error: {e}", + ) from e + request_ids = body.get("request_ids") + if request_ids is None: + raise HTTPException( + status_code=HTTPStatus.BAD_REQUEST.value, + detail="Missing 'request_ids' in request body", + ) + # Abort requests in background + asyncio.create_task(engine_client(raw_request).abort(request_ids)) + return Response(status_code=200) return app @@ -1851,6 +1918,20 @@ async def init_app_state( if "generate" in supported_tasks else None ) + state.serving_tokens = ( + ServingTokens( + engine_client, + state.openai_serving_models, + request_logger=request_logger, + return_tokens_as_token_ids=args.return_tokens_as_token_ids, + log_error_stack=args.log_error_stack, + enable_prompt_tokens_details=args.enable_prompt_tokens_details, + enable_log_outputs=args.enable_log_outputs, + force_no_detokenize=args.tokens_only, + ) + if "generate" in supported_tasks + else None + ) state.enable_server_load_tracking = args.enable_server_load_tracking state.server_load_metrics = 0 diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 476587c17823..946362ce2ef0 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -189,6 +189,11 @@ class FrontendArgs: Helps mitigate header abuse. Default: 256.""" log_error_stack: bool = envs.VLLM_SERVER_DEV_MODE """If set to True, log the stack trace of error responses""" + tokens_only: bool = False + """ + If set to True, only enable the Tokens In<>Out endpoint. + This is intended for use in a Disaggregated Everything setup. + """ @staticmethod def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 45584df8b9e2..65bd15ba387b 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -3220,3 +3220,80 @@ class TranslationResponseVerbose(OpenAIBaseModel): words: list[TranslationWord] | None = None """Extracted words and their corresponding timestamps.""" + + +####### Tokens IN <> Tokens OUT ####### +class GenerateRequest(BaseModel): + request_id: str = Field( + default_factory=lambda: f"{random_uuid()}", + description=( + "The request_id related to this request. If the caller does " + "not set it, a random_uuid will be generated. This id is used " + "through out the inference process and return in response." 
+ ), + ) + token_ids: list[int] + """The token ids to generate text from.""" + + # features: MultiModalFeatureSpec + # TODO (NickLucche): implement once Renderer work is completed + features: str | None = None + """The processed MM inputs for the model.""" + + sampling_params: SamplingParams + """The sampling parameters for the model.""" + + model: str | None = None + + stream: bool | None = False + stream_options: StreamOptions | None = None + cache_salt: str | None = Field( + default=None, + description=( + "If specified, the prefix cache will be salted with the provided " + "string to prevent an attacker to guess prompts in multi-user " + "environments. The salt should be random, protected from " + "access by 3rd parties, and long enough to be " + "unpredictable (e.g., 43 characters base64-encoded, corresponding " + "to 256 bit)." + ), + ) + priority: int = Field( + default=0, + description=( + "The priority of the request (lower means earlier handling; " + "default: 0). Any priority other than 0 will raise an error " + "if the served model does not use priority scheduling." + ), + ) + kv_transfer_params: dict[str, Any] | None = Field( + default=None, + description="KVTransfer parameters used for disaggregated serving.", + ) + + +class GenerateResponseChoice(BaseModel): + index: int + logprobs: ChatCompletionLogProbs | None = None + # per OpenAI spec this is the default + finish_reason: str | None = "stop" + token_ids: list[int] | None = None + + +class GenerateResponse(BaseModel): + request_id: str = Field( + default_factory=lambda: f"{random_uuid()}", + description=( + "The request_id related to this request. If the caller does " + "not set it, a random_uuid will be generated. This id is used " + "through out the inference process and return in response." 
+ ), + ) + choices: list[GenerateResponseChoice] + + prompt_logprobs: list[dict[int, Logprob] | None] | None = None + + kv_transfer_params: dict[str, Any] | None = Field( + default=None, + description="KVTransfer parameters used for disaggregated serving.", + ) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 03f10e5a91e6..c50b0c4a23e1 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -58,6 +58,8 @@ ErrorResponse, FunctionCall, FunctionDefinition, + GenerateRequest, + GenerateResponse, IOProcessorRequest, PoolingResponse, RerankRequest, @@ -134,6 +136,7 @@ | SpeechToTextRequest | ResponsesRequest | IOProcessorRequest + | GenerateRequest ) AnyResponse: TypeAlias = ( @@ -145,6 +148,7 @@ | PoolingResponse | ClassificationResponse | ScoreResponse + | GenerateResponse ) diff --git a/vllm/entrypoints/openai/serving_tokens.py b/vllm/entrypoints/openai/serving_tokens.py new file mode 100644 index 000000000000..69a526b9b70d --- /dev/null +++ b/vllm/entrypoints/openai/serving_tokens.py @@ -0,0 +1,269 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import asyncio +import time +from collections.abc import AsyncGenerator +from collections.abc import Sequence as GenericSequence + +from fastapi import Request + +# yapf: disable +from vllm.engine.protocol import EngineClient +from vllm.entrypoints.logger import RequestLogger +from vllm.entrypoints.openai.protocol import ( + ChatCompletionLogProb, + ChatCompletionLogProbs, + ChatCompletionLogProbsContent, + ErrorResponse, + GenerateRequest, + GenerateResponse, + GenerateResponseChoice, + PromptTokenUsageInfo, + RequestResponseMetadata, + UsageInfo, +) +from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs +from vllm.entrypoints.openai.serving_models import OpenAIServingModels +from vllm.inputs.data import TokensPrompt as EngineTokensPrompt +from vllm.logger import init_logger +from vllm.logprobs import Logprob +from vllm.outputs import RequestOutput +from vllm.sampling_params import SamplingParams +from vllm.utils.collection_utils import as_list + +logger = init_logger(__name__) + + +class ServingTokens(OpenAIServing): + """Provides Tokens IN <> Tokens OUT functionality to vLLM API.""" + + def __init__( + self, + engine_client: EngineClient, + models: OpenAIServingModels, + *, + request_logger: RequestLogger | None, + force_no_detokenize: bool = False, + return_tokens_as_token_ids: bool = False, + log_error_stack: bool = False, + enable_prompt_tokens_details: bool = False, + enable_log_outputs: bool = False, + ): + super().__init__(engine_client=engine_client, + models=models, + request_logger=request_logger, + return_tokens_as_token_ids=return_tokens_as_token_ids, + log_error_stack=log_error_stack) + self.enable_prompt_tokens_details = enable_prompt_tokens_details + self.enable_log_outputs = enable_log_outputs + self.force_no_detokenize = force_no_detokenize + if force_no_detokenize: + logger.info("Tokens-only mode is enabled, skipping detokenization " + "step for incoming requests.") + + async def serve_tokens( + self, + request: GenerateRequest, + raw_request: Request | None = None + ) -> GenerateResponse | ErrorResponse: + error_check_ret = await self._check_model(request) + if error_check_ret is not None: + logger.error("Error with model %s", error_check_ret) + return error_check_ret + + # If the engine is dead, raise the engine's DEAD_ERROR. 
+ # This is required for the streaming case, where we return a + # success status before we actually start generating text :). + if self.engine_client.errored: + raise self.engine_client.dead_error + + lora_request = None + lora_request = self._maybe_get_adapters(request, + supports_default_mm_loras=True) + + model_name = self.models.model_name(lora_request) + + request_id = "generate-tokens-" \ + f"{self._base_request_id(raw_request, request.request_id)}" + + request_metadata = RequestResponseMetadata(request_id=request_id) + if raw_request: + raw_request.state.request_metadata = request_metadata + + # TODO(NickLucche): Change to EngineCoreRequest once Renderer work is + # completed + engine_prompt = EngineTokensPrompt(prompt_token_ids=request.token_ids) + if request.features is not None: + engine_prompt["multi_modal_data"] = None + + if hasattr(request, "cache_salt") and request.cache_salt is not None: + engine_prompt["cache_salt"] = request.cache_salt + + # Schedule the request and get the result generator. + result_generator: AsyncGenerator[RequestOutput, None] | None = None + try: + sampling_params = request.sampling_params + if self.force_no_detokenize: + sampling_params.detokenize = False + + self._log_inputs(request_id, + request.token_ids, + params=sampling_params, + lora_request=lora_request) + + trace_headers = (None if raw_request is None else await + self._get_trace_headers(raw_request.headers)) + + result_generator = self.engine_client.generate( + engine_prompt, + sampling_params, + request_id, + lora_request=lora_request, + trace_headers=trace_headers, + priority=request.priority, + ) + + except ValueError as e: + return self.create_error_response(str(e)) + + # TODO(NickLucche): Implement streaming response + + try: + assert result_generator is not None + return await self.serve_tokens_full_generator( + request, result_generator, request_id, model_name, + request_metadata) + except ValueError as e: + return self.create_error_response(str(e)) + + async def serve_tokens_full_generator( + self, + request: GenerateRequest, + result_generator: AsyncGenerator[RequestOutput, None], + request_id: str, + model_name: str, + request_metadata: RequestResponseMetadata, + ) -> ErrorResponse | GenerateResponse: + + created_time = int(time.time()) + final_res: RequestOutput | None = None + sampling_params: SamplingParams = request.sampling_params + + try: + async for res in result_generator: + final_res = res + except asyncio.CancelledError: + return self.create_error_response("Client disconnected") + except ValueError as e: + return self.create_error_response(str(e)) + + assert final_res is not None + + choices: list[GenerateResponseChoice] = [] + num_generated_tokens = 0 + for output in final_res.outputs: + token_ids = output.token_ids + out_logprobs = output.logprobs + + # This is top_logprobs in completions API + if sampling_params.logprobs: + assert out_logprobs is not None, "Did not output logprobs" + logprobs = self._create_tokens_logprobs( + token_ids=token_ids, + top_logprobs=out_logprobs, + num_output_top_logprobs=sampling_params.logprobs, + ) + else: + logprobs = None + + choice_data = GenerateResponseChoice( + index=output.index, + logprobs=logprobs, + finish_reason=output.finish_reason + if output.finish_reason else "stop", + token_ids=as_list(output.token_ids)) + + choices.append(choice_data) + num_generated_tokens += len(output.token_ids) + + assert final_res.prompt_token_ids is not None + num_prompt_tokens = len(final_res.prompt_token_ids) + if 
final_res.encoder_prompt_token_ids is not None: + num_prompt_tokens += len(final_res.encoder_prompt_token_ids) + + usage = UsageInfo(prompt_tokens=num_prompt_tokens, + completion_tokens=num_generated_tokens, + total_tokens=num_prompt_tokens + + num_generated_tokens) + if self.enable_prompt_tokens_details and final_res.num_cached_tokens: + # This info is not available at the /coordinator level + usage.prompt_tokens_details = PromptTokenUsageInfo( + cached_tokens=final_res.num_cached_tokens) + + request_metadata.final_usage_info = usage + + response = GenerateResponse( + id=request_id, + created=created_time, + model=model_name, + choices=choices, + usage=usage, + prompt_logprobs=clamp_prompt_logprobs(final_res.prompt_logprobs), + kv_transfer_params=final_res.kv_transfer_params, + ) + + # Log complete response if output logging is enabled + if self.enable_log_outputs and self.request_logger: + for choice in choices: + # Get the corresponding output token IDs + output_token_ids = None + if choice.index < len(final_res.outputs): + output_token_ids = final_res.outputs[ + choice.index].token_ids + + if output_token_ids: + # Log token_ids only. + self.request_logger.log_outputs( + request_id=request_id, + outputs="", + output_token_ids=output_token_ids, + finish_reason=choice.finish_reason, + is_streaming=False, + delta=False, + ) + + return response + + def _create_tokens_logprobs( + self, + token_ids: GenericSequence[int], + top_logprobs: GenericSequence[dict[int, Logprob] | None], + num_output_top_logprobs: int | None = None, + ) -> ChatCompletionLogProbs: + """Create OpenAI-style logprobs.""" + logprobs_content: list[ChatCompletionLogProbsContent] = [] + + for i, token_id in enumerate(token_ids): + token = f"token_id:{token_id}" + step_top_logprobs = top_logprobs[i] + if step_top_logprobs is None or step_top_logprobs.get( + token_id) is None: + logprobs_content.append( + ChatCompletionLogProbsContent(token=token, )) + else: + step_token = step_top_logprobs[token_id] + + logprobs_content.append( + ChatCompletionLogProbsContent( + token=token, + logprob=max(step_token.logprob, -9999.0), + top_logprobs=[ + ChatCompletionLogProb( + token=token, + logprob=max(p[1].logprob, -9999.0), + ) for i, p in enumerate(step_top_logprobs.items()) + if num_output_top_logprobs + and i < num_output_top_logprobs + ])) + + return ChatCompletionLogProbs(content=logprobs_content) diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 4b2a3bc4dbaa..dd820840410e 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -15,6 +15,7 @@ from vllm.logger import init_logger from vllm.logits_process import LogitsProcessor from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.v1.serial_utils import PydanticMsgspecMixin logger = init_logger(__name__) @@ -122,6 +123,7 @@ class RequestOutputKind(Enum): class SamplingParams( + PydanticMsgspecMixin, msgspec.Struct, omit_defaults=True, # type: ignore[call-arg] # required for @cached_property. diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 058a4bcaecb5..3f621d77c024 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -15,6 +15,7 @@ from vllm.sampling_params import SamplingParams from vllm.v1.metrics.stats import SchedulerStats from vllm.v1.outputs import LogprobsLists, LogprobsTensors +from vllm.v1.serial_utils import UtilityResult # These are possible values of RequestOutput.finish_reason, # so form part of the external API. 
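Since `SamplingParams` now mixes in `PydanticMsgspecMixin` (see the sampling_params.py hunk above; the mixin itself is defined in serial_utils.py further below), the msgspec Struct can be declared directly as a field of a Pydantic model, which is what lets `GenerateRequest` carry `sampling_params: SamplingParams`. A hedged standalone sketch of that behaviour:

```python
# Sketch only: MiniGenerateRequest is an illustrative stand-in, not the real class.
from pydantic import BaseModel

from vllm.sampling_params import SamplingParams


class MiniGenerateRequest(BaseModel):
    token_ids: list[int]
    sampling_params: SamplingParams


req = MiniGenerateRequest.model_validate(
    {"token_ids": [1, 2, 3], "sampling_params": {"max_tokens": 8, "detokenize": False}}
)
print(type(req.sampling_params).__name__)  # SamplingParams
print(req.sampling_params.temperature)  # 1.0 – Struct defaults are preserved
```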
@@ -131,13 +132,6 @@ def finished(self) -> bool: return self.finish_reason is not None -class UtilityResult: - """Wrapper for special handling when serializing/deserializing.""" - - def __init__(self, r: Any = None): - self.result = r - - class UtilityOutput( msgspec.Struct, array_like=True, # type: ignore[call-arg] diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index cf0b1a41b50f..0a6806390451 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -8,7 +8,7 @@ from functools import partial from inspect import isclass from types import FunctionType -from typing import Any, TypeAlias +from typing import Any, TypeAlias, get_type_hints import cloudpickle import msgspec @@ -16,6 +16,8 @@ import torch import zmq from msgspec import msgpack +from pydantic import GetCoreSchemaHandler +from pydantic_core import core_schema from vllm import envs from vllm.logger import init_logger @@ -32,7 +34,6 @@ NestedTensors, ) from vllm.utils.platform_utils import is_pin_memory_available -from vllm.v1.engine import UtilityResult from vllm.v1.utils import tensor_data logger = init_logger(__name__) @@ -104,6 +105,13 @@ def _decode_type_info_recursive( return convert_fn(type_info, data) +class UtilityResult: + """Wrapper for special handling when serializing/deserializing.""" + + def __init__(self, r: Any = None): + self.result = r + + class MsgpackEncoder: """Encoder with custom torch tensor and numpy array serialization. @@ -469,3 +477,56 @@ def run_method( else: func = partial(method, obj) # type: ignore return func(*args, **kwargs) + + +class PydanticMsgspecMixin: + @classmethod + def __get_pydantic_core_schema__( + cls, source_type: Any, handler: GetCoreSchemaHandler + ) -> core_schema.CoreSchema: + """ + Make msgspec.Struct compatible with Pydantic, respecting defaults. + Handle JSON=>msgspec.Struct. Used when exposing msgspec.Struct to the + API as input or in `/docs`. Note this is cached by Pydantic and not + called on every validation. + """ + msgspec_fields = {f.name: f for f in msgspec.structs.fields(source_type)} + type_hints = get_type_hints(source_type) + + # Build the Pydantic typed_dict_field for each msgspec field + fields = {} + for name, hint in type_hints.items(): + msgspec_field = msgspec_fields[name] + + # typed_dict_field using the handler to get the schema + field_schema = handler(hint) + + # Add default value to the schema. 
+ if msgspec_field.default_factory is not msgspec.NODEFAULT: + wrapped_schema = core_schema.with_default_schema( + schema=field_schema, + default_factory=msgspec_field.default_factory, + ) + fields[name] = core_schema.typed_dict_field(wrapped_schema) + elif msgspec_field.default is not msgspec.NODEFAULT: + wrapped_schema = core_schema.with_default_schema( + schema=field_schema, + default=msgspec_field.default, + ) + fields[name] = core_schema.typed_dict_field(wrapped_schema) + else: + # No default, so Pydantic will treat it as required + fields[name] = core_schema.typed_dict_field(field_schema) + return core_schema.no_info_after_validator_function( + cls._validate_msgspec, + core_schema.typed_dict_schema(fields), + ) + + @classmethod + def _validate_msgspec(cls, value: Any) -> Any: + """Validate and convert input to msgspec.Struct instance.""" + if isinstance(value, cls): + return value + if isinstance(value, dict): + return cls(**value) + return msgspec.convert(value, type=cls) From 8cc40f89926f0f49d320c4ef078c70cf535c589e Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Fri, 14 Nov 2025 12:13:37 -0500 Subject: [PATCH 062/578] [Attention] Bump FA for removed method (#28429) Signed-off-by: Matthew Bonanni Co-authored-by: Cyrus Leung --- cmake/external_projects/vllm_flash_attn.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index 29db9fa273a4..567c8959f045 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -38,7 +38,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 8e1b01d56210dc72030a2d0d41c2d8d266ba6309 + GIT_TAG 58e0626a692f09241182582659e3bf8f16472659 GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn From a17e36f2236a15012c8ddfedbf076a526de4b17b Mon Sep 17 00:00:00 2001 From: Mohammad Othman <48595863+OthmanMohammad@users.noreply.github.com> Date: Fri, 14 Nov 2025 19:35:45 +0200 Subject: [PATCH 063/578] Fix typo in comment: existance -> existence (#28737) Signed-off-by: Mohammad Othman --- vllm/_aiter_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/_aiter_ops.py b/vllm/_aiter_ops.py index 5508e59bcd2f..7c35bf1857ba 100644 --- a/vllm/_aiter_ops.py +++ b/vllm/_aiter_ops.py @@ -30,7 +30,7 @@ def if_aiter_supported(func: Callable) -> Callable: @functools.wraps(func) def wrapper(*args, **kwargs): - # checks the platform, device arch and aiter library existance. + # checks the platform, device arch and aiter library existence. 
if current_platform.is_rocm() and IS_AITER_FOUND: from vllm.platforms.rocm import on_gfx9 From 085424808ef705efbf59e7b18bc010f53d9d7f75 Mon Sep 17 00:00:00 2001 From: Julien Denize <40604584+juliendenize@users.noreply.github.com> Date: Fri, 14 Nov 2025 18:54:38 +0100 Subject: [PATCH 064/578] Remove audio optional dependency for mistral-common (#28722) Signed-off-by: Julien Denize Signed-off-by: Julien Denize <40604584+juliendenize@users.noreply.github.com> Co-authored-by: Cyrus Leung Co-authored-by: Cyrus Leung --- docs/contributing/model/transcription.md | 2 +- docs/models/supported_models.md | 3 +++ examples/offline_inference/audio_language.py | 1 + requirements/common.txt | 2 +- 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/contributing/model/transcription.md b/docs/contributing/model/transcription.md index a590ecd6a1a2..fca941acd507 100644 --- a/docs/contributing/model/transcription.md +++ b/docs/contributing/model/transcription.md @@ -249,7 +249,7 @@ No extra registration is required beyond having your model class available via t ## Examples in-tree - Whisper encoder–decoder (audio-only): [vllm/model_executor/models/whisper.py](../../../vllm/model_executor/models/whisper.py) -- Voxtral decoder-only (audio embeddings + LLM): [vllm/model_executor/models/voxtral.py](../../../vllm/model_executor/models/voxtral.py) +- Voxtral decoder-only (audio embeddings + LLM): [vllm/model_executor/models/voxtral.py](../../../vllm/model_executor/models/voxtral.py). Make sure to have installed `mistral-common[audio]`. - Gemma3n decoder-only with fixed instruction prompt: [vllm/model_executor/models/gemma3n_mm.py](../../../vllm/model_executor/models/gemma3n_mm.py) ## Test with the API diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 0439e9cf2364..9cdf644c3cc5 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -785,6 +785,9 @@ Speech2Text models trained specifically for Automatic Speech Recognition. | `Gemma3nForConditionalGeneration` | Gemma3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | | `GraniteSpeechForConditionalGeneration` | Granite Speech | `ibm-granite/granite-speech-3.3-2b`, `ibm-granite/granite-speech-3.3-8b`, etc. | ✅︎ | ✅︎ | +!!! note + `VoxtralForConditionalGeneration` requires `mistral-common[audio]` to be installed. + ### Pooling Models See [this page](./pooling_models.md) for more information on how to use pooling models. diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py index 53d69bbdbdc7..04e6f99f8957 100644 --- a/examples/offline_inference/audio_language.py +++ b/examples/offline_inference/audio_language.py @@ -43,6 +43,7 @@ class ModelRequestData(NamedTuple): # Voxtral +# Make sure to install mistral-common[audio]. 
def run_voxtral(question: str, audio_count: int) -> ModelRequestData: from mistral_common.audio import Audio from mistral_common.protocol.instruct.chunk import ( diff --git a/requirements/common.txt b/requirements/common.txt index 90efb79a845d..ad92ba3ad827 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -31,7 +31,7 @@ partial-json-parser # used for parsing partial JSON outputs pyzmq >= 25.0.0 msgspec gguf >= 0.13.0 -mistral_common[image,audio] >= 1.8.5 +mistral_common[image] >= 1.8.5 opencv-python-headless >= 4.11.0 # required for video IO pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 From cdd7025961cf79480f885804c21e7d60866fb33f Mon Sep 17 00:00:00 2001 From: czhu-cohere Date: Fri, 14 Nov 2025 12:59:11 -0500 Subject: [PATCH 065/578] [kernel] Improve FP8 PTPC on Hopper for larger shapes (#28692) Signed-off-by: czhu-cohere --- .../c3x/scaled_mm_sm90_fp8_dispatch.cuh | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8_dispatch.cuh b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8_dispatch.cuh index 4ff3e65f2b2e..b8433214be1b 100644 --- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8_dispatch.cuh +++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8_dispatch.cuh @@ -116,6 +116,26 @@ struct sm90_fp8_config_default { ClusterShape, KernelSchedule, EpilogueSchedule>>; }; +template +struct sm90_fp8_config_M8192_K6144 { + // M >= 8192, K >= 6144 + static_assert(std::is_same()); + using KernelSchedule = + cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8FastAccum; + using EpilogueSchedule = + typename cutlass::epilogue::TmaWarpSpecializedCooperative; + using TileShape = Shape<_256, _128, _128>; + using ClusterShape = Shape<_2, _1, _1>; + + using Cutlass3xGemm = conditional_t< + EnableBias, + cutlass_3x_gemm_sm90_fp8, + cutlass_3x_gemm_sm90_fp8>; +}; + template struct sm90_fp8_config_M128 { // M in (64, 128] @@ -273,6 +293,9 @@ inline void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out, using Cutlass3xGemmDefault = typename sm90_fp8_config_default::Cutlass3xGemm; + using Cutlass3xGemmM8192_K6144 = + typename sm90_fp8_config_M8192_K6144::Cutlass3xGemm; using Cutlass3xGemmM128 = typename sm90_fp8_config_M128::Cutlass3xGemm; @@ -291,6 +314,7 @@ inline void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out, uint32_t const m = a.size(0); uint32_t const n = b.size(1); + uint32_t const k = a.size(1); if (m <= 16) { // m in [1, 16] @@ -312,6 +336,9 @@ inline void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out, // m in (64, 128] return cutlass_gemm_caller_sm90_fp8( out, a, b, a_scales, b_scales, std::forward(args)...); + } else if (m >= 8192 && k >= 6144) { + return cutlass_gemm_caller_sm90_fp8( + out, a, b, a_scales, b_scales, std::forward(args)...); } else { // m in (128, inf) return cutlass_gemm_caller_sm90_fp8( From 9261eb3dc19e985806a47ab2eb03035557f29c1f Mon Sep 17 00:00:00 2001 From: Chen Wang Date: Fri, 14 Nov 2025 13:08:30 -0500 Subject: [PATCH 066/578] docs(lora_resolvers): clarify multi-resolver order and storage path requirement (#28153) Signed-off-by: Chen Wang Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .markdownlint.yaml | 2 + docs/.nav.yml | 5 +- docs/design/lora_resolver_plugins.md | 220 ++++++++++++++++++++++++++ vllm/plugins/lora_resolvers/README.md | 16 -- 4 files changed, 226 
insertions(+), 17 deletions(-) create mode 100644 docs/design/lora_resolver_plugins.md delete mode 100644 vllm/plugins/lora_resolvers/README.md diff --git a/.markdownlint.yaml b/.markdownlint.yaml index cd9df57cd980..d0d3179766ef 100644 --- a/.markdownlint.yaml +++ b/.markdownlint.yaml @@ -3,6 +3,8 @@ MD007: MD013: false MD024: siblings_only: true +MD031: + list_items: false MD033: false MD045: false MD046: false diff --git a/docs/.nav.yml b/docs/.nav.yml index c103ed476d76..3151ea0e2ec2 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -46,7 +46,10 @@ nav: - contributing/model/multimodal.md - contributing/model/transcription.md - CI: contributing/ci - - Design Documents: design + - Design Documents: + - Plugins: + - design/*plugin*.md + - design/* - API Reference: - api/README.md - api/vllm diff --git a/docs/design/lora_resolver_plugins.md b/docs/design/lora_resolver_plugins.md new file mode 100644 index 000000000000..bd0dc6dc9c7b --- /dev/null +++ b/docs/design/lora_resolver_plugins.md @@ -0,0 +1,220 @@ +# LoRA Resolver Plugins + +This directory contains vLLM's LoRA resolver plugins built on the `LoRAResolver` framework. +They automatically discover and load LoRA adapters from a specified local storage path, eliminating the need for manual configuration or server restarts. + +## Overview + +LoRA Resolver Plugins provide a flexible way to dynamically load LoRA adapters at runtime. When vLLM +receives a request for a LoRA adapter that hasn't been loaded yet, the resolver plugins will attempt +to locate and load the adapter from their configured storage locations. This enables: + +- **Dynamic LoRA Loading**: Load adapters on-demand without server restarts +- **Multiple Storage Backends**: Support for filesystem, S3, and custom backends. The built-in `lora_filesystem_resolver` requires a local storage path, but custom resolvers can be implemented to fetch from any source. +- **Automatic Discovery**: Seamless integration with existing LoRA workflows +- **Scalable Deployment**: Centralized adapter management across multiple vLLM instances + +## Prerequisites + +Before using LoRA Resolver Plugins, ensure the following environment variables are configured: + +### Required Environment Variables + +1. **`VLLM_ALLOW_RUNTIME_LORA_UPDATING`**: Must be set to `true` or `1` to enable dynamic LoRA loading + ```bash + export VLLM_ALLOW_RUNTIME_LORA_UPDATING=true + ``` + +2. **`VLLM_PLUGINS`**: Must include the desired resolver plugins (comma-separated list) + ```bash + export VLLM_PLUGINS=lora_filesystem_resolver + ``` + +3. **`VLLM_LORA_RESOLVER_CACHE_DIR`**: Must be set to a valid directory path for filesystem resolver + ```bash + export VLLM_LORA_RESOLVER_CACHE_DIR=/path/to/lora/adapters + ``` + +### Optional Environment Variables + +- **`VLLM_PLUGINS`**: If not set, all available plugins will be loaded. If set to empty string, no plugins will be loaded. + +## Available Resolvers + +### lora_filesystem_resolver + +The filesystem resolver is installed with vLLM by default and enables loading LoRA adapters from a local directory structure. + +#### Setup Steps + +1. **Create the LoRA adapter storage directory**: + ```bash + mkdir -p /path/to/lora/adapters + ``` + +2. **Set environment variables**: + ```bash + export VLLM_ALLOW_RUNTIME_LORA_UPDATING=true + export VLLM_PLUGINS=lora_filesystem_resolver + export VLLM_LORA_RESOLVER_CACHE_DIR=/path/to/lora/adapters + ``` + +3. **Start vLLM server**: + Your base model can be `meta-llama/Llama-2-7b-hf`. 
Please make sure you set up the Hugging Face token in your env var `export HF_TOKEN=xxx235`. + ```bash + python -m vllm.entrypoints.openai.api_server \ + --model your-base-model \ + --enable-lora + ``` + +#### Directory Structure Requirements + +The filesystem resolver expects LoRA adapters to be organized in the following structure: + +```text +/path/to/lora/adapters/ +├── adapter1/ +│ ├── adapter_config.json +│ ├── adapter_model.bin +│ └── tokenizer files (if applicable) +├── adapter2/ +│ ├── adapter_config.json +│ ├── adapter_model.bin +│ └── tokenizer files (if applicable) +└── ... +``` + +Each adapter directory must contain: + +- **`adapter_config.json`**: Required configuration file with the following structure: + ```json + { + "peft_type": "LORA", + "base_model_name_or_path": "your-base-model-name", + "r": 16, + "lora_alpha": 32, + "target_modules": ["q_proj", "v_proj"], + "bias": "none", + "modules_to_save": null, + "use_rslora": false, + "use_dora": false + } + ``` + +- **`adapter_model.bin`**: The LoRA adapter weights file + +#### Usage Example + +1. **Prepare your LoRA adapter**: + ```bash + # Assuming you have a LoRA adapter in /tmp/my_lora_adapter + cp -r /tmp/my_lora_adapter /path/to/lora/adapters/my_sql_adapter + ``` + +2. **Verify the directory structure**: + ```bash + ls -la /path/to/lora/adapters/my_sql_adapter/ + # Should show: adapter_config.json, adapter_model.bin, etc. + ``` + +3. **Make a request using the adapter**: + ```bash + curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "my_sql_adapter", + "prompt": "Generate a SQL query for:", + "max_tokens": 50, + "temperature": 0.1 + }' + ``` + +#### How It Works + +1. When vLLM receives a request for a LoRA adapter named `my_sql_adapter` +2. The filesystem resolver checks if `/path/to/lora/adapters/my_sql_adapter/` exists +3. If found, it validates the `adapter_config.json` file +4. If the configuration matches the base model and is valid, the adapter is loaded +5. The request is processed normally with the newly loaded adapter +6. The adapter remains available for future requests + +## Advanced Configuration + +### Multiple Resolvers + +You can configure multiple resolver plugins to load adapters from different sources: + +'lora_s3_resolver' is an example of a custom resolver you would need to implement + +```bash +export VLLM_PLUGINS=lora_filesystem_resolver,lora_s3_resolver +``` + +All listed resolvers are enabled; at request time, vLLM tries them in order until one succeeds. + +### Custom Resolver Implementation + +To implement your own resolver plugin: + +1. **Create a new resolver class**: + ```python + from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry + from vllm.lora.request import LoRARequest + + class CustomResolver(LoRAResolver): + async def resolve_lora(self, base_model_name: str, lora_name: str) -> Optional[LoRARequest]: + # Your custom resolution logic here + pass + ``` + +2. **Register the resolver**: + ```python + def register_custom_resolver(): + resolver = CustomResolver() + LoRAResolverRegistry.register_resolver("Custom Resolver", resolver) + ``` + +## Troubleshooting + +### Common Issues + +1. **"VLLM_LORA_RESOLVER_CACHE_DIR must be set to a valid directory"** + - Ensure the directory exists and is accessible + - Check file permissions on the directory + +2. 
**"LoRA adapter not found"** + - Verify the adapter directory name matches the requested model name + - Check that `adapter_config.json` exists and is valid JSON + - Ensure `adapter_model.bin` exists in the directory + +3. **"Invalid adapter configuration"** + - Verify `peft_type` is set to "LORA" + - Check that `base_model_name_or_path` matches your base model + - Ensure `target_modules` is properly configured + +4. **"LoRA rank exceeds maximum"** + - Check that `r` value in `adapter_config.json` doesn't exceed `max_lora_rank` setting + +### Debugging Tips + +1. **Enable debug logging**: + ```bash + export VLLM_LOGGING_LEVEL=DEBUG + ``` + +2. **Verify environment variables**: + ```bash + echo $VLLM_ALLOW_RUNTIME_LORA_UPDATING + echo $VLLM_PLUGINS + echo $VLLM_LORA_RESOLVER_CACHE_DIR + ``` + +3. **Test adapter configuration**: + ```bash + python -c " + import json + with open('/path/to/lora/adapters/my_adapter/adapter_config.json') as f: + config = json.load(f) + print('Config valid:', config) + " + ``` diff --git a/vllm/plugins/lora_resolvers/README.md b/vllm/plugins/lora_resolvers/README.md deleted file mode 100644 index 48f27dddea07..000000000000 --- a/vllm/plugins/lora_resolvers/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# LoRA Resolver Plugins - -This directory contains vLLM general plugins for dynamically discovering and loading LoRA adapters -via the LoRAResolver plugin framework. - -Note that `VLLM_ALLOW_RUNTIME_LORA_UPDATING` must be set to true to allow LoRA resolver plugins -to work, and `VLLM_PLUGINS` must be set to include the desired resolver plugins. - -## lora_filesystem_resolver - -This LoRA Resolver is installed with vLLM by default. -To use, set `VLLM_PLUGIN_LORA_CACHE_DIR` to a local directory. When vLLM receives a request -for a LoRA adapter `foobar` it doesn't currently recognize, it will look in that local directory -for a subdirectory `foobar` containing a LoRA adapter. If such an adapter exists, it will -load that adapter, and then service the request as normal. That adapter will then be available -for future requests as normal. 
From 964d65deedb9ae0480fecdb2e726ba16d63409d7 Mon Sep 17 00:00:00 2001 From: Fardin Hoque Date: Fri, 14 Nov 2025 10:27:56 -0800 Subject: [PATCH 067/578] LLaMA4 LoRA Adapter Enablement (#28602) Signed-off-by: Fardin Hoque Co-authored-by: Wei Wei --- vllm/model_executor/models/mllama4.py | 36 +++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 14e741f32258..e25a104d822a 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -35,6 +35,7 @@ from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, @@ -45,6 +46,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.model_loader.utils import initialize_model from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import ( MultiModalDataDict, @@ -68,11 +70,15 @@ MixtureOfExperts, MultiModalEmbeddings, SupportsEagle3, + SupportsLoRA, SupportsMultiModal, SupportsPP, ) from .llama4 import Llama4ForCausalLM -from .utils import AutoWeightsLoader, maybe_prefix +from .utils import ( + AutoWeightsLoader, + maybe_prefix, +) from .vision import run_dp_sharded_vision_model @@ -724,7 +730,12 @@ def get_dummy_mm_data( dummy_inputs=Mllama4DummyInputsBuilder, ) class Llama4ForConditionalGeneration( - nn.Module, SupportsMultiModal, SupportsPP, MixtureOfExperts, SupportsEagle3 + nn.Module, + SupportsMultiModal, + SupportsPP, + MixtureOfExperts, + SupportsEagle3, + SupportsLoRA, ): merge_by_field_config = True @@ -1067,6 +1078,17 @@ def _load_other_weights( return updated_params + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + return FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="gate_proj", + ckpt_down_proj_name="down_proj", + ckpt_up_proj_name="up_proj", + num_experts=self.config.text_config.num_local_experts, + num_redundant_experts=self.num_redundant_experts, + ) + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) @@ -1113,3 +1135,13 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: ) return updated_params + + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + """ + return MultiModelKeys.from_string_field( + language_model="language_model", + connector="multi_modal_projector.", + tower_model="vision_model.", + ) From a425dc256e4c2f76f98be136cd898b43f02e6a32 Mon Sep 17 00:00:00 2001 From: TJian Date: Fri, 14 Nov 2025 10:30:50 -0800 Subject: [PATCH 068/578] [Bugfix] [ROCm] [AITER]: Fix aiter block quant not compatible with torch compile dynamo (#28716) Signed-off-by: tjtanaa --- tests/rocm/aiter/test_grouped_quant.py | 137 ++++++++++++++++++ vllm/_aiter_ops.py | 48 +++++- .../layers/quantization/utils/fp8_utils.py | 2 +- 3 files changed, 180 insertions(+), 7 deletions(-) create mode 100644 tests/rocm/aiter/test_grouped_quant.py diff --git 
a/tests/rocm/aiter/test_grouped_quant.py b/tests/rocm/aiter/test_grouped_quant.py new file mode 100644 index 000000000000..c7f0f1eda355 --- /dev/null +++ b/tests/rocm/aiter/test_grouped_quant.py @@ -0,0 +1,137 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# This is a test for the AITER group_fp8_quant op. +# It tests if the AITER op is +# 1. correctly defined the relationship between +# implementation and fake function +# 2. can be used with torch.compile +# 3. can be used with CUDA graphs +# This file will be skipped if AITER is not installed +# and the platform is not ROCm. + +import importlib.util + +import pytest +import torch + +# this import statement is needed to ensure the ops are registered +from vllm._aiter_ops import rocm_aiter_ops +from vllm.platforms import current_platform + +# Check if aiter package is installed +aiter_available = importlib.util.find_spec("aiter") is not None + +pytestmark = pytest.mark.skipif( + not (current_platform.is_rocm() and aiter_available), + reason="AITER ops are only available on ROCm with aiter package installed", +) + + +def test_rocm_aiter_group_fp8_quant_fake_implementation(): + """Test that the fake implementation is correctly + defined for torch.ops.vllm.rocm_aiter_group_fp8_quant.""" + # Create test tensors + M = 128 + N = 4096 + group_size = 128 + + input_tensor = torch.randn((M, N), dtype=torch.bfloat16, device="cuda") + + # Verify the op's fake implementation using torch.library.opcheck + # This checks that the fake function returns tensors with correct shapes and dtypes + torch.library.opcheck( + torch.ops.vllm.rocm_aiter_group_fp8_quant, + (input_tensor, group_size), + test_utils=("test_faketensor",), + ) + + +def test_rocm_aiter_group_fp8_quant_torch_compile_with_cudagraph(): + """Test that rocm_aiter_ops.group_fp8_quant + with group size 128 can be used with + torch.compile in cudagraph mode.""" + # Create test tensors + M = 128 + N = 4096 + group_size = 128 + + input_tensor = torch.randn((M, N), dtype=torch.bfloat16, device="cuda") + + # Define a function that uses the op + def group_fp8_quant_fn(x): + return rocm_aiter_ops.group_fp8_quant(x, group_size) + + # Compile with cudagraph mode + compiled_fn = torch.compile( + group_fp8_quant_fn, + fullgraph=True, + backend="inductor", + mode="reduce-overhead", + dynamic=False, + ) + + # Run eager mode + x_fp8_eager, scales_eager = group_fp8_quant_fn(input_tensor) + + # Run compiled version (first run will trigger compilation) + x_fp8_compiled, scales_compiled = compiled_fn(input_tensor) + + # Verify shapes match + assert x_fp8_compiled.shape == x_fp8_eager.shape + assert scales_compiled.shape == scales_eager.shape + + # Verify expected shapes + assert x_fp8_compiled.shape == (M, N) + expected_scale_cols = (N + group_size - 1) // group_size + assert scales_compiled.shape == (M, expected_scale_cols) + + # Verify results match + assert torch.allclose( + x_fp8_compiled.to(torch.float32), + x_fp8_eager.to(torch.float32), + rtol=1e-2, + atol=1e-2, + ) + assert torch.allclose(scales_compiled, scales_eager, rtol=1e-3, atol=1e-3) + + # Test with different input (reusing compiled graph) + input_tensor_2 = torch.randn((M, N), dtype=torch.bfloat16, device="cuda") + x_fp8_eager_2, scales_eager_2 = group_fp8_quant_fn(input_tensor_2) + x_fp8_compiled_2, scales_compiled_2 = compiled_fn(input_tensor_2) + + # Verify second run also produces correct results + assert torch.allclose( + x_fp8_compiled_2.to(torch.float32), + 
x_fp8_eager_2.to(torch.float32), + rtol=1e-2, + atol=1e-2, + ) + assert torch.allclose(scales_compiled_2, scales_eager_2, rtol=1e-3, atol=1e-3) + + +def test_rocm_aiter_group_fp8_quant_different_shapes(): + """Test rocm_aiter_ops.group_fp8_quant with different input shapes.""" + group_size = 128 + + test_shapes = [ + (64, 2048), + (256, 8192), + (32, 1024), + (512, 4096), + ] + + for M, N in test_shapes: + input_tensor = torch.randn((M, N), dtype=torch.bfloat16, device="cuda") + + x_fp8, scales = rocm_aiter_ops.group_fp8_quant(input_tensor, group_size) + + # Verify shapes + assert x_fp8.shape == (M, N) + expected_scale_cols = (N + group_size - 1) // group_size + assert scales.shape == (M, expected_scale_cols) + + # Verify dtypes + from aiter import dtypes + + assert x_fp8.dtype == dtypes.fp8 + assert scales.dtype == torch.float32 diff --git a/vllm/_aiter_ops.py b/vllm/_aiter_ops.py index 7c35bf1857ba..e53e4ae6e529 100644 --- a/vllm/_aiter_ops.py +++ b/vllm/_aiter_ops.py @@ -43,6 +43,36 @@ def wrapper(*args, **kwargs): return wrapper +def _rocm_aiter_group_fp8_quant_impl( + x: torch.Tensor, + group_size: int, +) -> tuple[torch.Tensor, torch.Tensor]: + assert x.shape[-1] % group_size == 0, "Input shape must be divisible by group size" + from aiter import QuantType, dtypes, get_hip_quant + + aiter_per1x128_quant = get_hip_quant(QuantType.per_1x128) + return aiter_per1x128_quant(x.contiguous(), quant_dtype=dtypes.fp8) + + +def _rocm_aiter_group_fp8_quant_fake( + x: torch.Tensor, + group_size: int, +) -> tuple[torch.Tensor, torch.Tensor]: + from aiter import dtypes + + M, N = x.shape + x_fp8 = torch.empty((M, N), dtype=dtypes.fp8, device=x.device) + out_bs = torch.empty( + ( + M, + (N + group_size - 1) // group_size, + ), + dtype=torch.float32, + device=x.device, + ) + return x_fp8, out_bs + + def _rocm_aiter_fused_moe_impl( hidden_states: torch.Tensor, w1: torch.Tensor, @@ -512,6 +542,14 @@ def register_ops_once() -> None: ) # register all the custom ops here + direct_register_custom_op( + op_name="rocm_aiter_group_fp8_quant", + op_func=_rocm_aiter_group_fp8_quant_impl, + mutates_args=[], + fake_impl=_rocm_aiter_group_fp8_quant_fake, + dispatch_key=current_platform.dispatch_key, + ) + direct_register_custom_op( op_name="rocm_aiter_asm_moe_tkw1", op_func=_rocm_aiter_asm_moe_tkw1_impl, @@ -887,14 +925,12 @@ def triton_gemm_a8w8_blockscale( return gemm_a8w8_blockscale(A, B, As, Bs, dtype=output_dtype) @staticmethod - def per_1x128_fp8_quant( + def group_fp8_quant( input_2d: torch.Tensor, + group_size: int = 128, ) -> tuple[torch.Tensor, ...]: - """Only applies quantization method for fp8 data type only.""" - from aiter import QuantType, dtypes, get_hip_quant - - aiter_per1x128_quant = get_hip_quant(QuantType.per_1x128) - return aiter_per1x128_quant(input_2d.contiguous(), quant_dtype=dtypes.fp8) + assert group_size == 128, "Group size must be 128" + return torch.ops.vllm.rocm_aiter_group_fp8_quant(input_2d, group_size) @staticmethod def is_triton_gemm_w8a8_tuned(n: int, k: int) -> bool: diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 541c6c631053..ae63b4a76726 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -342,7 +342,7 @@ def _run_aiter( ) # MI300 uses tuned AITER ASM/C++ kernel else: - q_input, input_scale = rocm_aiter_ops.per_1x128_fp8_quant(input_2d) + q_input, input_scale = 
rocm_aiter_ops.group_fp8_quant(input_2d) return gemm_a8w8_blockscale_op( q_input, From 67187554dd478ba76e79d7a6f8bf02be01290de3 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 14 Nov 2025 18:39:19 +0000 Subject: [PATCH 069/578] [Docs] Enable some more markdown lint rules for the docs (#28731) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .markdownlint.yaml | 3 --- docs/contributing/benchmarks.md | 2 -- docs/contributing/ci/update_pytorch_version.md | 2 +- docs/deployment/frameworks/chatbox.md | 4 ++-- docs/deployment/frameworks/dify.md | 6 +++--- docs/design/fused_moe_modular_kernel.md | 8 ++++---- 6 files changed, 10 insertions(+), 15 deletions(-) diff --git a/.markdownlint.yaml b/.markdownlint.yaml index d0d3179766ef..937487f47364 100644 --- a/.markdownlint.yaml +++ b/.markdownlint.yaml @@ -6,9 +6,6 @@ MD024: MD031: list_items: false MD033: false -MD045: false MD046: false -MD051: false MD052: false -MD053: false MD059: false diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md index dca01eab5b42..ec0dfc4199d1 100644 --- a/docs/contributing/benchmarks.md +++ b/docs/contributing/benchmarks.md @@ -10,8 +10,6 @@ vLLM provides comprehensive benchmarking tools for performance testing and evalu - **[Parameter sweeps](#parameter-sweeps)**: Automate `vllm bench` runs for multiple configurations - **[Performance benchmarks](#performance-benchmarks)**: Automated CI benchmarks for development -[Benchmark CLI]: #benchmark-cli - ## Benchmark CLI This section guides you through running benchmark tests with the extensive diff --git a/docs/contributing/ci/update_pytorch_version.md b/docs/contributing/ci/update_pytorch_version.md index f983c25f26ee..09fd85a466ee 100644 --- a/docs/contributing/ci/update_pytorch_version.md +++ b/docs/contributing/ci/update_pytorch_version.md @@ -95,7 +95,7 @@ when manually triggering a build on Buildkite. This branch accomplishes two thin to warm it up so that future builds are faster.

- + Buildkite new build popup

## Update dependencies diff --git a/docs/deployment/frameworks/chatbox.md b/docs/deployment/frameworks/chatbox.md index 002935da5600..5f7cef1a87df 100644 --- a/docs/deployment/frameworks/chatbox.md +++ b/docs/deployment/frameworks/chatbox.md @@ -29,8 +29,8 @@ pip install vllm - API Path: `/chat/completions` - Model: `qwen/Qwen1.5-0.5B-Chat` - ![](../../assets/deployment/chatbox-settings.png) + ![Chatbox settings screen](../../assets/deployment/chatbox-settings.png) 1. Go to `Just chat`, and start to chat: - ![](../../assets/deployment/chatbox-chat.png) + ![Chatbot chat screen](../../assets/deployment/chatbox-chat.png) diff --git a/docs/deployment/frameworks/dify.md b/docs/deployment/frameworks/dify.md index 820ef0cbed9f..673cbf4b6a24 100644 --- a/docs/deployment/frameworks/dify.md +++ b/docs/deployment/frameworks/dify.md @@ -46,12 +46,12 @@ And install [Docker](https://docs.docker.com/engine/install/) and [Docker Compos - **Model Name for API Endpoint**: `Qwen/Qwen1.5-7B-Chat` - **Completion Mode**: `Completion` - ![](../../assets/deployment/dify-settings.png) + ![Dify settings screen](../../assets/deployment/dify-settings.png) 1. To create a test chatbot, go to `Studio → Chatbot → Create from Blank`, then select Chatbot as the type: - ![](../../assets/deployment/dify-create-chatbot.png) + ![Dify create chatbot screen](../../assets/deployment/dify-create-chatbot.png) 1. Click the chatbot you just created to open the chat interface and start interacting with the model: - ![](../../assets/deployment/dify-chat.png) + ![Dify chat screen](../../assets/deployment/dify-chat.png) diff --git a/docs/design/fused_moe_modular_kernel.md b/docs/design/fused_moe_modular_kernel.md index 76df0d8d8a38..e1a96be6c344 100644 --- a/docs/design/fused_moe_modular_kernel.md +++ b/docs/design/fused_moe_modular_kernel.md @@ -19,9 +19,9 @@ The input activation format completely depends on the All2All Dispatch being use The FusedMoE operation is generally made of multiple operations, in both the Contiguous and Batched variants, as described in the diagrams below -![](../assets/design/fused_moe_modular_kernel/fused_moe_non_batched.png "FusedMoE Non-Batched") +![FusedMoE Non-Batched](../assets/design/fused_moe_modular_kernel/fused_moe_non_batched.png) -![](../assets/design/fused_moe_modular_kernel/fused_moe_batched.png "FusedMoE Batched") +![FusedMoE Batched](../assets/design/fused_moe_modular_kernel/fused_moe_batched.png) !!! note The main difference, in terms of operations, between the Batched and Non-Batched cases is the Permute / Unpermute operations. All other operations remain. @@ -57,7 +57,7 @@ The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEPermuteExperts The `FusedMoEPrepareAndFinalize` abstract class exposes `prepare`, `prepare_no_receive` and `finalize` functions. The `prepare` function is responsible for input activation Quantization and All2All Dispatch. If implemented, The `prepare_no_receive` is like `prepare` except it does not wait to receive results from other workers. Instead it returns a "receiver" callback that must be invoked to wait for the final results of worker. It is not required that this method is supported by all `FusedMoEPrepareAndFinalize` classes, but if it is available, it can be used to interleave work with the initial all to all communication, e.g. interleaving shared experts with fused experts. The `finalize` function is responsible for invoking the All2All Combine. 
Additionally the `finalize` function may or may not do the TopK weight application and reduction (Please refer to the TopKWeightAndReduce section) -![](../assets/design/fused_moe_modular_kernel/prepare_and_finalize_blocks.png "FusedMoEPrepareAndFinalize Blocks") +![FusedMoEPrepareAndFinalize Blocks](../assets/design/fused_moe_modular_kernel/prepare_and_finalize_blocks.png) ### FusedMoEPermuteExpertsUnpermute @@ -88,7 +88,7 @@ The core FusedMoE implementation performs a series of operations. It would be in It is sometimes efficient to perform TopK weight application and Reduction inside the `FusedMoEPermuteExpertsUnpermute::apply()`. Find an example [here](https://github.com/vllm-project/vllm/pull/20228). We have a `TopKWeightAndReduce` abstract class to facilitate such implementations. Please refer to the TopKWeightAndReduce section. `FusedMoEPermuteExpertsUnpermute::finalize_weight_and_reduce_impl()` returns the `TopKWeightAndReduce` object that the implementation wants the `FusedMoEPrepareAndFinalize::finalize()` to use. -![](../assets/design/fused_moe_modular_kernel/fused_experts_blocks.png "FusedMoEPermuteExpertsUnpermute Blocks") +![FusedMoEPermuteExpertsUnpermute Blocks](../assets/design/fused_moe_modular_kernel/fused_experts_blocks.png) ### FusedMoEModularKernel From e2741f6cbce6dc4c364d0a8d77375259d72a21ef Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 15 Nov 2025 02:39:57 +0800 Subject: [PATCH 070/578] [Chore] Rename `SchedulerConfig.chunked_prefill_enabled` (#28735) Signed-off-by: DarkLight1337 --- tests/v1/core/test_scheduler.py | 1 - tests/v1/e2e/test_spec_decode.py | 10 ++++------ tests/v1/engine/test_engine_core.py | 2 +- vllm/config/scheduler.py | 11 ++++++++--- vllm/config/vllm.py | 6 +++--- vllm/platforms/cpu.py | 2 +- vllm/v1/core/sched/scheduler.py | 2 +- vllm/v1/engine/core.py | 2 +- vllm/v1/worker/gpu_model_runner.py | 4 ++-- 9 files changed, 21 insertions(+), 19 deletions(-) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 287e735b5491..04e738293cd7 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -2282,7 +2282,6 @@ def _validate_chunked_prefill_settings_for_encoder_decoder( ) -> None: """Validate chunked prefill settings in the scheduler config for encoder-decoder models.""" - assert scheduler_config.chunked_prefill_enabled is expect_enabled assert scheduler_config.enable_chunked_prefill is expect_enabled if is_encoder_decoder: # Encoder-decoder models should automatically disable chunked multimodal diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 4a6b84ae4817..6cffaafb127e 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -272,7 +272,7 @@ def test_speculators_model_integration( @pytest.mark.parametrize( - ["model_setup", "mm_enabled", "chunked_prefill_enabled"], + ["model_setup", "mm_enabled", "enable_chunked_prefill"], [ (("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False, False), pytest.param( @@ -358,7 +358,7 @@ def test_eagle_correctness( sampling_config: SamplingParams, model_setup: tuple[str, str, str, int], mm_enabled: bool, - chunked_prefill_enabled: bool, + enable_chunked_prefill: bool, attn_backend: str, ): if attn_backend == "TREE_ATTN": @@ -396,9 +396,7 @@ def test_eagle_correctness( method, model_name, spec_model_name, tp_size = model_setup max_model_len = 2048 - max_num_batched_tokens = max_model_len - if chunked_prefill_enabled: - max_num_batched_tokens = 128 + 
max_num_batched_tokens = 128 if enable_chunked_prefill else max_model_len ref_llm = LLM( model=model_name, max_model_len=max_model_len, tensor_parallel_size=tp_size @@ -420,7 +418,7 @@ def test_eagle_correctness( }, max_model_len=max_model_len, max_num_batched_tokens=max_num_batched_tokens, - enable_chunked_prefill=chunked_prefill_enabled, + enable_chunked_prefill=enable_chunked_prefill, ) spec_outputs = spec_llm.chat(test_prompts, sampling_config) matches = 0 diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 4e852dca95eb..3ba8ab26f552 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -571,7 +571,7 @@ def test_encoder_instance_zero_kv_cache( ) # Check 5: Verify chunked prefill is disabled - assert not vllm_config.scheduler_config.chunked_prefill_enabled, ( + assert not vllm_config.scheduler_config.enable_chunked_prefill, ( "Encoder instance should disable chunked prefill (no KV cache)" ) diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py index 5117344a6844..444568994a95 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -8,7 +8,7 @@ from pydantic import Field, field_validator, model_validator from pydantic.dataclasses import dataclass -from typing_extensions import Self +from typing_extensions import Self, deprecated from vllm.config.utils import config from vllm.logger import init_logger @@ -233,6 +233,11 @@ def __post_init__(self, is_encoder_decoder: bool) -> None: ) @property + @deprecated( + "`SchedulerConfig.chunked_prefill_enabled` has been renamed to " + "`SchedulerConfig.enable_chunked_prefill`. " + "The old name will be removed in v0.12." + ) def chunked_prefill_enabled(self) -> bool: return self.enable_chunked_prefill @@ -244,7 +249,7 @@ def chunked_prefill_enabled(self, value: bool): def _verify_args(self) -> Self: if ( self.max_num_batched_tokens < self.max_model_len - and not self.chunked_prefill_enabled + and not self.enable_chunked_prefill ): raise ValueError( f"max_num_batched_tokens ({self.max_num_batched_tokens}) is " @@ -271,7 +276,7 @@ def _verify_args(self) -> Self: ) if self.max_num_partial_prefills > 1: - if not self.chunked_prefill_enabled: + if not self.enable_chunked_prefill: raise ValueError( "Chunked prefill must be enabled to set " "max_num_partial_prefills > 1." 
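The rename above keeps the old `SchedulerConfig.chunked_prefill_enabled` attribute working through a deprecated property alias. A minimal standalone sketch of that pattern, using an illustrative config class rather than the real vLLM one:

```python
from dataclasses import dataclass

from typing_extensions import deprecated


@dataclass
class DemoSchedulerConfig:
    enable_chunked_prefill: bool = False

    @property
    @deprecated(
        "`chunked_prefill_enabled` has been renamed to `enable_chunked_prefill`."
    )
    def chunked_prefill_enabled(self) -> bool:
        # Reads through the old name go via this deprecated getter.
        return self.enable_chunked_prefill

    @chunked_prefill_enabled.setter
    def chunked_prefill_enabled(self, value: bool) -> None:
        # Writes to the old name are forwarded to the new field.
        self.enable_chunked_prefill = value


cfg = DemoSchedulerConfig(enable_chunked_prefill=True)
assert cfg.chunked_prefill_enabled  # emits a DeprecationWarning at runtime
cfg.chunked_prefill_enabled = False
assert cfg.enable_chunked_prefill is False
```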
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index f581267f73f7..1e6e455210c8 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -411,7 +411,7 @@ def __post_init__(self): if ( self.model_config is not None - and self.scheduler_config.chunked_prefill_enabled + and self.scheduler_config.enable_chunked_prefill and self.model_config.dtype == torch.float32 and current_platform.get_device_capability() == (7, 5) ): @@ -584,7 +584,7 @@ def __post_init__(self): ): for reason in disable_chunked_prefill_reasons: logger.info(reason) - self.scheduler_config.chunked_prefill_enabled = False + self.scheduler_config.enable_chunked_prefill = False self.scheduler_config.long_prefill_token_threshold = 0 if self.cache_config is not None: @@ -1026,7 +1026,7 @@ def __str__(self): f"seed={self.model_config.seed}, " f"served_model_name={self.model_config.served_model_name}, " f"enable_prefix_caching={self.cache_config.enable_prefix_caching}, " - f"chunked_prefill_enabled={self.scheduler_config.chunked_prefill_enabled}, " # noqa + f"enable_chunked_prefill={self.scheduler_config.enable_chunked_prefill}, " # noqa f"pooler_config={self.model_config.pooler_config!r}, " f"compilation_config={self.compilation_config!r}" ) diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index fdfa1c19789c..1da34629472c 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -192,7 +192,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: scheduler_config = vllm_config.scheduler_config if ( - scheduler_config.chunked_prefill_enabled + scheduler_config.enable_chunked_prefill or cache_config.enable_prefix_caching ) and cache_config.cache_dtype != "auto": raise RuntimeError( diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 4fcc7955df19..ba7ad0c09173 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -497,7 +497,7 @@ def schedule(self) -> SchedulerOutput: # chunked prefill has to be enabled explicitly to allow # pooling requests to be chunked if ( - not self.scheduler_config.chunked_prefill_enabled + not self.scheduler_config.enable_chunked_prefill and num_new_tokens > token_budget ): self.waiting.pop_request() diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index ffb5232e770d..a6965182fc2c 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -124,7 +124,7 @@ def __init__( # Encoder models without KV cache don't support # chunked prefill. But do SSM models? 
logger.info("Disabling chunked prefill for model without KVCache") - vllm_config.scheduler_config.chunked_prefill_enabled = False + vllm_config.scheduler_config.enable_chunked_prefill = False scheduler_block_size = ( vllm_config.cache_config.block_size diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 341bf58f2da8..9b3e5b668aab 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2031,7 +2031,7 @@ def get_supported_pooling_tasks(self) -> list[PoolingTask]: supported_tasks = list(model.pooler.get_supported_tasks()) - if self.scheduler_config.chunked_prefill_enabled: + if self.scheduler_config.enable_chunked_prefill: if "token_embed" in supported_tasks: supported_tasks.remove("token_embed") if "token_classify" in supported_tasks: @@ -3825,7 +3825,7 @@ def _dummy_pooler_run( supported_pooling_tasks = self.get_supported_pooling_tasks() if not supported_pooling_tasks: - if self.scheduler_config.chunked_prefill_enabled: + if self.scheduler_config.enable_chunked_prefill: raise RuntimeError( f"Model {self.model_config.model} does not support " "any pooling tasks with chunked prefill enabled. " From cec275efcef62a9fb6ea5c3445572dddf9736206 Mon Sep 17 00:00:00 2001 From: GuanH <60228748+GuanH@users.noreply.github.com> Date: Sat, 15 Nov 2025 02:44:27 +0800 Subject: [PATCH 071/578] [Bugfix] resolve Qwen3-VL GPTQModel quantized model loading failure (#28663) Signed-off-by: GuanH Signed-off-by: Isotr0py Co-authored-by: Isotr0py --- vllm/model_executor/models/qwen3_vl.py | 4 +++- vllm/model_executor/models/utils.py | 5 +++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index faeb9f81d961..f1c020ab5813 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -1138,7 +1138,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.config = config self.quant_config = quant_config - self.model = Qwen3LLMModel(vllm_config=vllm_config, prefix=prefix) + self.model = Qwen3LLMModel( + vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") + ) if get_pp_group().is_last_rank: if config.tie_word_embeddings: diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index f14b79f2886c..e5663c8a057a 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -117,9 +117,10 @@ class AutoWeightsLoader: environment variable `VLLM_LOGGING_LEVEL=DEBUG`. """ - # Models trained using early version ColossalAI - # may include these tensors in checkpoint. Skip them. + # Models trained using early version ColossalAI or quantized by + # GPTQModel may include these tensors in checkpoint. Skip them. ROTARY_EMBEDS_UNUSED_WEIGHTS = [ + "rotary_pos_emb.inv_freq", "rotary_emb.inv_freq", "rotary_emb.cos_cached", "rotary_emb.sin_cached", From fd4555089a7ea3094499d9a6a9cec1c1b6903674 Mon Sep 17 00:00:00 2001 From: Andrey Khalyavin Date: Fri, 14 Nov 2025 21:58:18 +0300 Subject: [PATCH 072/578] [BugFix] Fix misprint introduced by modular_kernel refactoring. 
(#28728) Signed-off-by: Andrey Khalyavin --- vllm/model_executor/layers/fused_moe/modular_kernel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index a3142f37053f..093affe51f50 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -1060,7 +1060,7 @@ def input_chunk_range(chunk_idx: int) -> tuple[int, int]: global_num_experts=global_num_experts, expert_map=expert_map, a1q_scale=_slice_scales(a1q_scale, s, e), - a2_scale=_slice_scales(self.fused_experts.a2_scale, e, e), + a2_scale=_slice_scales(self.fused_experts.a2_scale, s, e), workspace13=workspace13, workspace2=workspace2, expert_tokens_meta=c_expert_tokens_meta, From 8977ffb5e6428a3e682d47d9ca8342ccab9916f8 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Fri, 14 Nov 2025 11:06:01 -0800 Subject: [PATCH 073/578] [ROCm][Bugfix] Fix compilation errors with fused_qknorm_rope_kernel.cu (#28682) Signed-off-by: Sage Moore --- csrc/fused_qknorm_rope_kernel.cu | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/csrc/fused_qknorm_rope_kernel.cu b/csrc/fused_qknorm_rope_kernel.cu index 83017250ebcd..baff8363162e 100644 --- a/csrc/fused_qknorm_rope_kernel.cu +++ b/csrc/fused_qknorm_rope_kernel.cu @@ -37,6 +37,16 @@ #ifdef USE_ROCM #define FINAL_MASK 0xffffffffffffffffULL + + #if defined(HIP_VERSION) && HIP_VERSION < 70000000 +// On ROCm versions before 7.0, __syncwarp isn't defined. The below +// implementation is copy/pasted from the implementation in ROCm 7.0 +__device__ inline void __syncwarp() { + __builtin_amdgcn_fence(__ATOMIC_RELEASE, "wavefront"); + __builtin_amdgcn_wave_barrier(); + __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "wavefront"); +} + #endif #else #define FINAL_MASK 0xffffffff #endif From f08eab2acc17da9e86d20673bd801659ca912749 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 14 Nov 2025 15:29:55 -0500 Subject: [PATCH 074/578] [CI] Fix macos smoke test uv cache issue (#28736) Signed-off-by: mgoin --- .github/workflows/macos-smoke-test.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/macos-smoke-test.yml b/.github/workflows/macos-smoke-test.yml index f56fdc0dbe79..8d40aa587bf0 100644 --- a/.github/workflows/macos-smoke-test.yml +++ b/.github/workflows/macos-smoke-test.yml @@ -11,9 +11,12 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: astral-sh/setup-uv@v4 + - uses: astral-sh/setup-uv@v7 with: enable-cache: true + cache-dependency-glob: | + requirements/**/*.txt + pyproject.toml python-version: '3.12' - name: Install dependencies From 0de4f217abe2c73ce6df52743365302466f7bc00 Mon Sep 17 00:00:00 2001 From: Marcin Ostrowski Date: Fri, 14 Nov 2025 22:13:53 +0100 Subject: [PATCH 075/578] [Bugfix] TypeError: 'NoneType' object is not callable (#27410) Signed-off-by: Marcin Ostrowski --- tests/v1/core/test_kv_cache_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index df6a5f109874..24611a4aaa1b 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -55,7 +55,7 @@ def _auto_init_hash_fn(request): hash_fn: Callable if "hash_fn" in request.fixturenames: - hash_fn = init_none_hash(request.getfixturevalue("hash_fn")) + hash_fn = request.getfixturevalue("hash_fn") else: hash_fn = sha256 init_none_hash(hash_fn) From 
5a84b76b86e03694d612afc8f0225512d9b4ddc9 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Fri, 14 Nov 2025 16:34:18 -0500 Subject: [PATCH 076/578] [ROCm][CI/Build] Change install location of uv (#28741) Signed-off-by: Gregory Shtrasberg --- docker/Dockerfile.rocm | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 137452cad2c1..731a97d93da1 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -17,10 +17,7 @@ RUN python3 -m pip install --upgrade pip RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)" # Install UV -RUN curl -LsSf https://astral.sh/uv/install.sh | sh - -# Activate virtual environment and add uv to PATH -ENV PATH="/root/.local/bin:$PATH" +RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR="/usr/local/bin" sh # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # Reference: https://github.com/astral-sh/uv/pull/1694 From 2e0ad629b0422358d424e1fcfddeb22d102936e8 Mon Sep 17 00:00:00 2001 From: Laith Sakka Date: Fri, 14 Nov 2025 14:11:10 -0800 Subject: [PATCH 077/578] Avoid bytecode hook and simplify TorchCompileWrapperWithCustomDipatch (#25110) Signed-off-by: Laith Sakka --- .../compile/piecewise/test_multiple_graphs.py | 11 +- tests/compile/piecewise/test_simple.py | 3 + tests/compile/piecewise/test_toy_llama.py | 9 +- tests/compile/test_wrapper.py | 155 +++++++++--- .../multimodal/generation/test_qwen2_5_vl.py | 10 + tests/v1/e2e/test_spec_decode.py | 8 + vllm/compilation/decorators.py | 234 +++++++++--------- vllm/compilation/wrapper.py | 212 ++++++++++------ vllm/envs.py | 6 + vllm/v1/worker/tpu_model_runner.py | 10 +- 10 files changed, 422 insertions(+), 236 deletions(-) diff --git a/tests/compile/piecewise/test_multiple_graphs.py b/tests/compile/piecewise/test_multiple_graphs.py index 64d626bae483..6d3788af9de0 100644 --- a/tests/compile/piecewise/test_multiple_graphs.py +++ b/tests/compile/piecewise/test_multiple_graphs.py @@ -22,6 +22,8 @@ from vllm.forward_context import BatchDescriptor, set_forward_context from vllm.utils.torch_utils import is_torch_equal_or_newer +from ...utils import create_new_process_for_each_test + # This import automatically registers `torch.ops.silly.attention` from .. 
import silly_attention # noqa: F401 @@ -193,7 +195,14 @@ def run_model( @pytest.mark.parametrize("use_inductor_graph_partition", [False, True]) -def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool): +@pytest.mark.parametrize("use_bytecode_hook", [True, False]) +@create_new_process_for_each_test("spawn") +def test_multi_graph_piecewise_compile( + use_inductor_graph_partition: bool, use_bytecode_hook: bool, monkeypatch +): + # Set the environment variable for this test + monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0") + if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"): pytest.skip("inductor graph partition is only available in PyTorch 2.9+") diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index a48af8a8952a..e258133ab50a 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -21,6 +21,8 @@ from vllm.forward_context import BatchDescriptor, set_forward_context from vllm.utils.torch_utils import is_torch_equal_or_newer +from ...utils import create_new_process_for_each_test + # This import automatically registers `torch.ops.silly.attention` from ..silly_attention import get_global_counter, reset_global_counter @@ -124,6 +126,7 @@ def _run_simple_model( @pytest.mark.parametrize("use_inductor", [True, False]) @torch.inference_mode() +@create_new_process_for_each_test("spawn") def test_simple_piecewise_compile(use_inductor): _run_simple_model( splitting_ops=["silly::attention"], diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index 92998ede1699..915fbc6ce7f3 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -29,6 +29,8 @@ from vllm.forward_context import BatchDescriptor, set_forward_context from vllm.utils.torch_utils import is_torch_equal_or_newer +from ...utils import create_new_process_for_each_test + # This import automatically registers `torch.ops.silly.attention` from .. 
import silly_attention # noqa: F401 @@ -334,6 +336,7 @@ def run_model(llama_config, compile_config: CompilationConfig) -> torch.Tensor: ("inductor", True), # Inductor, Inductor partition ], ) +@create_new_process_for_each_test("spawn") def test_toy_llama( backend: str, use_inductor_graph_partition: bool, monkeypatch, tmp_path ): @@ -513,4 +516,8 @@ def benchmark(): if __name__ == "__main__": - benchmark() + # Protect against subprocess reimport when using spawn_new_process_for_each_test + import os + + if os.environ.get("RUNNING_IN_SUBPROCESS") != "1": + benchmark() diff --git a/tests/compile/test_wrapper.py b/tests/compile/test_wrapper.py index da0afd9eaa49..356cac7af258 100644 --- a/tests/compile/test_wrapper.py +++ b/tests/compile/test_wrapper.py @@ -2,59 +2,134 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os + +import pytest import torch -from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher -from vllm.config import CompilationMode +from vllm.compilation.wrapper import TorchCompileWithNoGuardsWrapper +from vllm.config import ( + CompilationConfig, + CompilationMode, + VllmConfig, + set_current_vllm_config, +) class MyMod(torch.nn.Module): def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None): - if cache is not None: - return x + cache - return x * 2 + if x.size()[0] >= 4: + return x * 2 + else: + return x * 100 -class MyWrapper(TorchCompileWrapperWithCustomDispatcher): +class MyWrapper(TorchCompileWithNoGuardsWrapper): def __init__(self, model): self.model = model - compiled_callable = torch.compile(self.forward, backend="eager") - super().__init__( - compiled_callable, compilation_mode=CompilationMode.DYNAMO_TRACE_ONCE - ) + super().__init__() - def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None): + def forward(self, x: torch.Tensor): # type: ignore[override] # this is the function to be compiled - return self.model(x, cache) - - def __call__(self, x: torch.Tensor, cache: torch.Tensor | None = None): - # let torch.compile compile twice - if len(self.compiled_codes) == 2: - dispatch_id = 0 if cache is None else 1 - with self.dispatch_to_code(dispatch_id): - return self.forward(x, cache) - else: - return self.compiled_callable(x, cache) + return self.model(x) + +@pytest.mark.parametrize("use_bytecode_hook", [True, False]) +def test_torch_compile_wrapper(use_bytecode_hook, monkeypatch): + """Test basic functionality of TorchCompileWithNoGuardsWrapper.""" + # Set the environment variable for this test + monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0") -def test_torch_compile_wrapper(): - mod = MyMod() - wrappers = [] - for i in range(3): + # Create a proper vLLM config instead of mocking + vllm_config = VllmConfig() + vllm_config.compilation_config = CompilationConfig() + vllm_config.compilation_config.mode = CompilationMode.DYNAMO_TRACE_ONCE + vllm_config.compilation_config.backend = "inductor" + + # Test DYNAMO_TRACE_ONCE + with set_current_vllm_config(vllm_config): torch._dynamo.reset() + mod = MyMod() + wrapper = MyWrapper(mod) + + # First call should trigger compilation + x = torch.tensor([1, 2, 3, 4]) + torch._dynamo.mark_dynamic(x, 0) + + result1 = wrapper(x) + expected1 = torch.tensor([2, 4, 6, 8]) + assert torch.allclose(result1, expected1), ( + f"Expected {expected1}, got {result1}" + ) + + # Second call should use compiled code + x2 = torch.tensor([1, 2, 3]) + result2 = wrapper(x2) + expected2 = torch.tensor([2, 4, 6]) + assert torch.allclose(result2, 
expected2), ( + f"Expected {expected2}, got {result2}" + ) + + # without the wrapper result would be different. + result3 = mod(x2) + expected3 = torch.tensor([100, 200, 300]) + + assert torch.allclose(result3, expected3), ( + f"Expected {result3}, got {expected3}" + ) + + # with STOCK_TORCH_COMPILE we do not remove guards. + vllm_config.compilation_config.mode = CompilationMode.STOCK_TORCH_COMPILE + torch._dynamo.reset() + with set_current_vllm_config(vllm_config): + mod = MyMod() wrapper = MyWrapper(mod) - wrappers.append(wrapper) - x = torch.tensor([1]) - wrapper(x, None) # profile run, compile - # create a cache tensor - cache = torch.tensor([2]) - wrapper(x, cache) # warm up with cache, recompile - - # for new input, dispatch to the compiled code directly - new_x = torch.tensor([3]) - assert wrapper(new_x, None).item() == 6 # dispatch to the first compiled code - assert wrapper(new_x, cache).item() == 5 # dispatch to the second compiled code - - for wrapper in wrappers: - # make sure they have independent compiled codes - assert len(wrapper.compiled_codes) == 2 + + # First call should trigger compilation + x = torch.tensor([1, 2, 3, 4]) + torch._dynamo.mark_dynamic(x, 0) + + result1 = wrapper(x) + expected1 = torch.tensor([2, 4, 6, 8]) + assert torch.allclose(result1, expected1), ( + f"Expected {expected1}, got {result1}" + ) + + # Second call should triger another compilation + x2 = torch.tensor([1, 2, 3]) + result2 = wrapper(x2) + expected2 = torch.tensor([100, 200, 300]) + assert torch.allclose(result2, expected2), ( + f"Expected {expected2}, got {result2}" + ) + + # NO_COMPILATION level not supported. + vllm_config.compilation_config.mode = None + torch._dynamo.reset() + with set_current_vllm_config(vllm_config): + torch._dynamo.reset() + mod = MyMod() + + try: + wrapper = MyWrapper(mod) + except Exception: + return + raise AssertionError("expected an exception to be raised") + + +if __name__ == "__main__": + # Run with both parameter values + + class MockMonkeypatch: + def setenv(self, name, value): + os.environ[name] = value + + mp = MockMonkeypatch() + + print("Testing with VLLM_USE_BYTECODE_HOOK=False") + test_torch_compile_wrapper(False, mp) + + print("Testing with VLLM_USE_BYTECODE_HOOK=True") + test_torch_compile_wrapper(True, mp) + + print("All tests passed!") diff --git a/tests/models/multimodal/generation/test_qwen2_5_vl.py b/tests/models/multimodal/generation/test_qwen2_5_vl.py index 6b009075abfa..3ba665710af4 100644 --- a/tests/models/multimodal/generation/test_qwen2_5_vl.py +++ b/tests/models/multimodal/generation/test_qwen2_5_vl.py @@ -34,6 +34,7 @@ def qwen2_5_vl_chat_template(*query): @pytest.mark.parametrize("num_frames", [16]) @pytest.mark.parametrize("dtype", [target_dtype]) @pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("use_bytecode_hook", [True, False]) def test_qwen2_5_vl_evs_functionality( vllm_runner, video_assets, @@ -42,10 +43,14 @@ def test_qwen2_5_vl_evs_functionality( num_frames: int, dtype: str, max_tokens: int, + use_bytecode_hook: bool, + monkeypatch, ) -> None: """Test EVS (Efficient Video Sampling) functionality with different pruning rates. 
""" + # Set the environment variable for this test + monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0") # Sample frames from video assets sampled_vids = [ @@ -86,6 +91,7 @@ def test_qwen2_5_vl_evs_functionality( @pytest.mark.parametrize("num_frames", [16]) @pytest.mark.parametrize("dtype", [target_dtype]) @pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("use_bytecode_hook", [True, False]) def test_qwen2_5_vl_evs_batched_videos( vllm_runner, video_assets, @@ -94,6 +100,8 @@ def test_qwen2_5_vl_evs_batched_videos( num_frames: int, dtype: str, max_tokens: int, + use_bytecode_hook: bool, + monkeypatch, ) -> None: """Test EVS functionality with batched videos. @@ -102,6 +110,8 @@ def test_qwen2_5_vl_evs_batched_videos( 2. Both pruning configurations work with multiple videos 3. The model doesn't crash when processing multiple videos simultaneously """ + # Set the environment variable for this test + monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0") # Sample frames from video assets sampled_vids = [ sample_frames_from_video(asset.np_ndarrays, num_frames) diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 6cffaafb127e..03396270a31c 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -75,6 +75,14 @@ def model_name(): return "meta-llama/Llama-3.1-8B-Instruct" +@pytest.fixture(autouse=True) +def reset_torch_dynamo(): + """Reset torch dynamo cache before each test""" + yield + # Cleanup after test + torch._dynamo.reset() + + @pytest.mark.parametrize( "speculative_config", [ diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 0946fa69171b..e325bca73abb 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -17,7 +17,7 @@ import vllm.envs as envs from vllm.compilation.counter import compilation_counter -from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher +from vllm.compilation.wrapper import TorchCompileWithNoGuardsWrapper from vllm.config import ( CompilationMode, VllmConfig, @@ -246,14 +246,14 @@ def _support_torch_compile( """ A decorator to add support for compiling the forward method of a class. 
""" - if TorchCompileWrapperWithCustomDispatcher in cls.__bases__: + if TorchCompileWithNoGuardsWrapper in cls.__bases__: # support decorating multiple times return cls # take care of method resolution order # make sure super().__init__ is called on the base class - # other than TorchCompileWrapperWithCustomDispatcher - cls.__bases__ = cls.__bases__ + (TorchCompileWrapperWithCustomDispatcher,) + # other than TorchCompileWithNoGuardsWrapper + cls.__bases__ = cls.__bases__ + (TorchCompileWithNoGuardsWrapper,) old_init = cls.__init__ @@ -290,12 +290,43 @@ def __init__( return compilation_counter.num_models_seen += 1 - TorchCompileWrapperWithCustomDispatcher.__init__( - self, compilation_mode=vllm_config.compilation_config.mode - ) + self.compiled = False + TorchCompileWithNoGuardsWrapper.__init__(self) cls.__init__ = __init__ + def _mark_dynamic_inputs(mod, *args, **kwargs): + sig = inspect.signature(mod.__class__.forward) + bound_args = sig.bind(mod, *args, **kwargs) + bound_args.apply_defaults() + for k, dims in dynamic_arg_dims.items(): + arg = bound_args.arguments.get(k) + if arg is not None: + dims = [dims] if isinstance(dims, int) else dims + if isinstance(arg, torch.Tensor): + # In case dims is specified with negative indexing + dims = [arg.ndim + dim if dim < 0 else dim for dim in dims] + torch._dynamo.mark_dynamic(arg, dims) + elif isinstance(arg, IntermediateTensors): + for tensor in arg.tensors.values(): + # In case dims is specified with negative indexing + dims = [tensor.ndim + dim if dim < 0 else dim for dim in dims] + torch._dynamo.mark_dynamic(tensor, dims) + else: + raise ValueError( + "Unsupported dynamic dimensions" + f" {dims} for argument {k} with type {type(arg)}." + ) + if mark_unbacked_dims: + for k, dims in mark_unbacked_dims.items(): + arg = bound_args.arguments.get(k) + if arg is not None: + dims = [dims] if isinstance(dims, int) else dims + if isinstance(arg, torch.Tensor): + # In case dims is specified with negative indexing + dims = [arg.ndim + dim if dim < 0 else dim for dim in dims] + torch._dynamo.decorators.mark_unbacked(arg, dims) + def __call__(self, *args, **kwargs): # torch.compiler.is_compiling() means we are inside the compilation # e.g. TPU has the compilation logic in model runner, so we don't @@ -303,6 +334,7 @@ def __call__(self, *args, **kwargs): if self.do_not_compile or torch.compiler.is_compiling(): return self.forward(*args, **kwargs) + # if aot_compiled_fn is set, just call it. if getattr(self, "aot_compiled_fn", None) is not None: return self.aot_compiled_fn(self, *args, **kwargs) @@ -362,120 +394,84 @@ def __call__(self, *args, **kwargs): ) return self.aot_compiled_fn(self, *args, **kwargs) + if self.compiled: + assert not envs.VLLM_USE_AOT_COMPILE + return TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs) + + # This is the path for the first compilation. 
+ # the first compilation needs to have dynamic shapes marked - if len(self.compiled_codes) < 1: - sig = inspect.signature(self.__class__.forward) - bound_args = sig.bind(self, *args, **kwargs) - bound_args.apply_defaults() - for k, dims in dynamic_arg_dims.items(): - arg = bound_args.arguments.get(k) - if arg is not None: - dims = [dims] if isinstance(dims, int) else dims - if isinstance(arg, torch.Tensor): - # In case dims is specified with negative indexing - dims = [arg.ndim + dim if dim < 0 else dim for dim in dims] - torch._dynamo.mark_dynamic(arg, dims) - elif isinstance(arg, IntermediateTensors): - for tensor in arg.tensors.values(): - # In case dims is specified with negative indexing - dims = [ - tensor.ndim + dim if dim < 0 else dim for dim in dims - ] - torch._dynamo.mark_dynamic(tensor, dims) - else: - raise ValueError( - "Unsupported dynamic dimensions" - f" {dims} for argument {k} with type {type(arg)}." - ) - if mark_unbacked_dims: - for k, dims in mark_unbacked_dims.items(): - arg = bound_args.arguments.get(k) - if arg is not None: - dims = [dims] if isinstance(dims, int) else dims - if isinstance(arg, torch.Tensor): - # In case dims is specified with negative indexing - dims = [arg.ndim + dim if dim < 0 else dim for dim in dims] - torch._dynamo.decorators.mark_unbacked(arg, dims) - # here, it is the starting point of the `torch.compile` process - start_monitoring_torch_compile(self.vllm_config) - logger.debug("Start compiling function %s", self.original_code_object) - - # if we don't use custom dispatcher, we can directly call the - # compiled function and let torch.compile handle the dispatching, - # with the overhead of guard evaluation and recompilation. - if len(self.compiled_codes) < 1 or not self.use_custom_dispatcher: - # it seems Dynamo reuse the compilation across instances, - # while we need to make sure the compiled code is not reused. - # we need to control all the compilation of the model. - torch._dynamo.eval_frame.remove_from_cache(self.original_code_object) - - # collect all relevant files traced by Dynamo, - # so that the compilation cache can trigger re-compilation - # properly when any of these files change. - - # 1. the file containing the top-level forward function - self.vllm_config.compilation_config.traced_files.add( - self.original_code_object.co_filename - ) + _mark_dynamic_inputs(self, *args, **kwargs) - # 2. every time Dynamo sees a function call, it will inline - # the function by calling InliningInstructionTranslator.inline_call_ - # we hijack this function to know all the functions called - # during Dynamo tracing, and their corresponding files - inline_call = InliningInstructionTranslator.inline_call_ - - def patched_inline_call(self_): - code = self_.f_code - self.vllm_config.compilation_config.traced_files.add(code.co_filename) - return inline_call(self_) - - # Disable the C++ compilation of symbolic shape guards. C++-fication - # of symbolic shape guards can improve guard overhead. But, since - # vllm skip guards anyways, setting this flag to False can improve - # compile time. 
- dynamo_config_patches = {} - try: - _ = torch._dynamo.config.enable_cpp_symbolic_shape_guards - dynamo_config_patches["enable_cpp_symbolic_shape_guards"] = False - except AttributeError: - # Note: this config is not available in torch 2.6, we can skip - # if the config doesn't exist - logger.debug("enable_cpp_symbolic_shape_guards config not available") - - with ( - patch.object( - InliningInstructionTranslator, "inline_call_", patched_inline_call - ), - torch._dynamo.config.patch(**dynamo_config_patches), - maybe_use_cudagraph_partition_wrapper(self.vllm_config), - _torch27_patch_tensor_subclasses(), - ): - if envs.VLLM_USE_AOT_COMPILE: - self.aot_compiled_fn = self.aot_compile(*args, **kwargs) - output = self.aot_compiled_fn(self, *args, **kwargs) - assert aot_compilation_path is not None - assert cache_dir is not None - try: - os.makedirs(cache_dir, exist_ok=True) - self.aot_compiled_fn.save_compiled_function( - aot_compilation_path - ) - except Exception as e: - logger.warning( - "Cannot save aot compilation to path %s, error: %s", - aot_compilation_path, - str(e), - ) - else: - output = self.compiled_callable(*args, **kwargs) - return output - - # usually, capturing the model once is enough, and then we can - # dispatch to the compiled code directly, without going through - # the Dynamo guard mechanism. - with self.dispatch_to_code(0): - model_output = self.forward(*args, **kwargs) - return model_output + # here, it is the starting point of the `torch.compile` process + start_monitoring_torch_compile(self.vllm_config) + original_code_object = self.original_code_object() + logger.debug("Start compiling function %s", original_code_object) + + # we do not want tp delete the original code object entries since + # we depend on them now to look up cached compiled functions. + # torch._dynamo.eval_frame.remove_from_cache(original_code_object) + + # collect all relevant files traced by Dynamo, + # so that the compilation cache can trigger re-compilation + # properly when any of these files change. + + # 1. the file containing the top-level forward function + self.vllm_config.compilation_config.traced_files.add( + original_code_object.co_filename + ) + + # 2. every time Dynamo sees a function call, it will inline + # the function by calling InliningInstructionTranslator.inline_call_ + # we hijack this function to know all the functions called + # during Dynamo tracing, and their corresponding files + inline_call = InliningInstructionTranslator.inline_call_ + + def patched_inline_call(self_): + code = self_.f_code + self.vllm_config.compilation_config.traced_files.add(code.co_filename) + return inline_call(self_) + + # Disable the C++ compilation of symbolic shape guards. C++-fication + # of symbolic shape guards can improve guard overhead. But, since + # vllm skip guards anyways, setting this flag to False can improve + # compile time. 
+ dynamo_config_patches = {} + try: + _ = torch._dynamo.config.enable_cpp_symbolic_shape_guards + dynamo_config_patches["enable_cpp_symbolic_shape_guards"] = False + except AttributeError: + # Note: this config is not available in torch 2.6, we can skip + # if the config doesn't exist + logger.debug("enable_cpp_symbolic_shape_guards config not available") + + with ( + patch.object( + InliningInstructionTranslator, "inline_call_", patched_inline_call + ), + torch._dynamo.config.patch(**dynamo_config_patches), + maybe_use_cudagraph_partition_wrapper(self.vllm_config), + _torch27_patch_tensor_subclasses(), + ): + if envs.VLLM_USE_AOT_COMPILE: + self.aot_compiled_fn = self.aot_compile(*args, **kwargs) + output = self.aot_compiled_fn(self, *args, **kwargs) + assert aot_compilation_path is not None + assert cache_dir is not None + try: + os.makedirs(cache_dir, exist_ok=True) + self.aot_compiled_fn.save_compiled_function(aot_compilation_path) + except Exception as e: + logger.warning( + "Cannot save aot compilation to path %s, error: %s", + aot_compilation_path, + str(e), + ) + else: + output = TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs) + + self.compiled = True + return output cls.__call__ = __call__ return cls diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index 4d26619bd128..493e57f97f0f 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -4,11 +4,11 @@ import os import sys from abc import abstractmethod -from collections.abc import Callable from contextlib import contextmanager from types import CodeType import torch +import torch._C._dynamo.guards import vllm.envs as envs from vllm.config import CompilationMode, CUDAGraphMode, get_current_vllm_config @@ -17,88 +17,153 @@ logger = init_logger(__name__) -class TorchCompileWrapperWithCustomDispatcher: +def _noop_add_global_state_guard(self, *args, **kwargs): + """No-op to skip the GLOBAL_STATE guard entirely""" + pass + + +def _noop_add_torch_function_mode_stack_guard(self, *args, **kwargs): + """No-op to skip the TORCH_FUNCTION_MODE_STACK guard entirely""" + pass + + +@contextmanager +def _compilation_context(): + """Context manager for compilation settings and patches. + + This manager: + 1. Sets higher dynamo cache limits for compilation. (Needed for + qwen2_5_vl see test_qwen2_5_vl_evs_functionality). + Generally a recompilation can happen whenever we use a new + backend instance in torch.compile. + 2. Patches out add_global_state_guard to skip GLOBAL_STATE guards + 3. Patches out add_torch_function_mode_stack_guard to skip + TORCH_FUNCTION_MODE_STACK guards. + 4. Restores everything when compilation completes """ - A wrapper class for torch.compile, with a custom dispatch logic. - Subclasses should: - 1. Implement the forward method - 2. Implement the dispatch logic in the __call__ method - It can use `self.compiled_codes` to access the compiled bytecode, - and `with self.dispatch_to_code(index):` to dispatch to - the compiled code. - 3. Implement the `__init__` method to determine how to call - `torch.compile` over the forward method. 
+ # Save original values + original_global_state_guard = ( + torch._C._dynamo.guards.GuardManager.add_global_state_guard + ) + original_torch_function_mode_stack_guard = ( + torch._C._dynamo.guards.GuardManager.add_torch_function_mode_stack_guard + ) + original_cache_size = torch._dynamo.config.cache_size_limit + original_accumulated_cache = torch._dynamo.config.accumulated_cache_size_limit + + try: + # Set higher cache limits for compilation + torch._dynamo.config.cache_size_limit = 2048 + torch._dynamo.config.accumulated_cache_size_limit = 8192 + + # Patch guard manager + torch._C._dynamo.guards.GuardManager.add_global_state_guard = ( + _noop_add_global_state_guard + ) + torch._C._dynamo.guards.GuardManager.add_torch_function_mode_stack_guard = ( + _noop_add_torch_function_mode_stack_guard + ) + yield + finally: + # Restore original values + torch._C._dynamo.guards.GuardManager.add_global_state_guard = ( + original_global_state_guard + ) + torch._C._dynamo.guards.GuardManager.add_torch_function_mode_stack_guard = ( + original_torch_function_mode_stack_guard + ) + torch._dynamo.config.cache_size_limit = original_cache_size + torch._dynamo.config.accumulated_cache_size_limit = original_accumulated_cache + + +class TorchCompileWithNoGuardsWrapper: """ + A wrapper class for torch.compile, it ensures that all guards are dropped + when CompilationMode is not CompilationMode.STOCK_TORCH_COMPILE. + When guards are dropped, the first time __call__ is invoked, a single + compilation is triggered. Dynamo should never be traced again after that + since we drop all guards. + """ + + def __init__(self): + self.compiled = False - def __init__( - self, - compiled_callable: Callable | None = None, - compilation_mode: CompilationMode = CompilationMode.NONE, - ): vllm_config = get_current_vllm_config() self.vllm_config = vllm_config - if compiled_callable is None: - # default compilation settings - # compiling the forward method - - backend = vllm_config.compilation_config.init_backend(vllm_config) - options = None - if isinstance(backend, str) and backend == "inductor": - options = ( - get_current_vllm_config().compilation_config.inductor_compile_config - ) - if envs.VLLM_USE_AOT_COMPILE: - options = options or {} - # This effectively drop all the guards. - # We need this because bytecode hook is not used any more to - # drop guards in the AOT compile mode. - options["guard_filter_fn"] = lambda guards: [False for _ in guards] - if hasattr(torch._dynamo.config, "enable_aot_compile"): - torch._dynamo.config.enable_aot_compile = True - else: - msg = "torch._dynamo.config.enable_aot_compile is not " - msg += "available. AOT compile is disabled and please " - msg += "upgrade PyTorch version to use AOT compile." - logger.warning(msg) - - compiled_callable = torch.compile( - self.forward, fullgraph=True, backend=backend, options=options - ) - - self.compiled_callable = compiled_callable - self.original_code_object = self.__class__.forward.__code__ - self.compiled_codes: list[CodeType] = [] - torch._dynamo.convert_frame.register_bytecode_hook(self.bytecode_hook) - - # read the env var to determine whether to use the custom dispatcher - # subclasses can use this to switch between the custom dispatcher - # and the default Dynamo guard mechanism. 
- self.use_custom_dispatcher: bool = ( - compilation_mode >= CompilationMode.DYNAMO_TRACE_ONCE + mode = vllm_config.compilation_config.mode + if mode is None: + raise RuntimeError("Compilation mode cannot be NO_COMPILATION") + + backend = vllm_config.compilation_config.init_backend(vllm_config) + options = {} + + if isinstance(backend, str) and backend == "inductor": + options = vllm_config.compilation_config.inductor_compile_config + + if mode != CompilationMode.STOCK_TORCH_COMPILE: + # Drop all the guards. + options["guard_filter_fn"] = lambda x: [False for _ in x] + + if envs.VLLM_USE_AOT_COMPILE: + if hasattr(torch._dynamo.config, "enable_aot_compile"): + torch._dynamo.config.enable_aot_compile = True + else: + msg = "torch._dynamo.config.enable_aot_compile is not " + msg += "available. AOT compile is disabled and please " + msg += "upgrade PyTorch version to use AOT compile." + logger.warning(msg) + + self._compiled_callable = torch.compile( + self.forward, + fullgraph=True, + dynamic=False, + backend=backend, + options=options, ) + if envs.VLLM_USE_BYTECODE_HOOK and mode != CompilationMode.STOCK_TORCH_COMPILE: + torch._dynamo.convert_frame.register_bytecode_hook(self.bytecode_hook) + self._compiled_bytecode = None + def aot_compile(self, *args, **kwargs): - if not hasattr(self.compiled_callable, "aot_compile"): + if not hasattr(self._compiled_callable, "aot_compile"): raise RuntimeError( "aot_compile is not supported by the current configuration. " + "Please make sure torch.compile is enabled with the latest " + f"version of PyTorch (current using torch: {torch.__version__})" ) - return self.compiled_callable.aot_compile((args, kwargs)) + return self._compiled_callable.aot_compile((args, kwargs)) def __call__(self, *args, **kwargs): - """Implement the dispatch logic here, beyond the torch.compile mode. - NOTE: this function can have additional arguments beyond the forward - method, for directly dispatching to the compiled code. - """ - return self.compiled_callable(*args, **kwargs) + if envs.VLLM_USE_BYTECODE_HOOK: + if ( + self.vllm_config.compilation_config.mode + == CompilationMode.STOCK_TORCH_COMPILE + ): + return self._compiled_callable(*args, **kwargs) + + if not self._compiled_bytecode: + # Make sure a compilation is triggered by clearing dynamo + # cache. + torch._dynamo.eval_frame.remove_from_cache(self.original_code_object()) + return self._compiled_callable(*args, **kwargs) + else: + with self._dispatch_to_compiled_code(): + return self.forward(*args, **kwargs) + else: + with _compilation_context(): + return self._compiled_callable(*args, **kwargs) @abstractmethod def forward(self, *args, **kwargs): ... 
+ def original_code_object(self) -> CodeType: + """Return the original code object of the forward method.""" + return self.__class__.forward.__code__ + def bytecode_hook(self, old_code: CodeType, new_code: CodeType): """Hook to save the compiled bytecode for direct execution.""" - if old_code is not self.original_code_object: + if old_code is not self.original_code_object(): return # code borrowed from https://github.com/thuml/depyf/blob/f4ad79fadee27ea113b4c75202db1eb1a11c0dbc/depyf/explain/enable_debugging.py#L25 frame = sys._getframe() @@ -114,7 +179,7 @@ def bytecode_hook(self, old_code: CodeType, new_code: CodeType): if frame.f_locals["self"] is not self: return - self.compiled_codes.append(new_code) + self._compiled_bytecode = new_code path = self.vllm_config.compile_debug_dump_path() if path: @@ -153,16 +218,21 @@ def bytecode_hook(self, old_code: CodeType, new_code: CodeType): raise RuntimeError(msg) @contextmanager - def dispatch_to_code(self, index: int): - """Context manager to dispatch to the compiled code. + def _dispatch_to_compiled_code(self): + # noqa: E501 + """ + Context manager to dispatch to internally compiled code for torch<2.8. Why does this work? Because Dynamo guarantees that the compiled bytecode has exactly the same arguments, cell variables, and free variables as the original code. Therefore we can directly switch the code object in the function and call it. - See https://dev-discuss.pytorch.org/t/what-is-the-relationship-requirement-among-original-bytecode-transformed-bytecode-and-bytecode-returned-by-hooks-in-dynamo/1693/7 - for more details. - """ - self.__class__.forward.__code__ = self.compiled_codes[index] - yield - self.__class__.forward.__code__ = self.original_code_object + See https://dev-discuss.pytorch.org/t/what-is-the-relationship-requirement-among-original-bytecode-transformed-bytecode-and-bytecode-returned-by-hooks-in-dynamo/1693/7 for more details. + """ # noqa: E501 line too long + original = self.original_code_object() + assert self._compiled_bytecode is not None + self.__class__.forward.__code__ = self._compiled_bytecode + try: + yield + finally: + self.__class__.forward.__code__ = original diff --git a/vllm/envs.py b/vllm/envs.py index 0530938c32f9..7987e5fb83fd 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -92,6 +92,7 @@ VLLM_TORCH_PROFILER_RECORD_SHAPES: bool = False VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: bool = False VLLM_USE_AOT_COMPILE: bool = False + VLLM_USE_BYTECODE_HOOK: bool = False VLLM_FORCE_AOT_LOAD: bool = False VLLM_TORCH_PROFILER_WITH_STACK: bool = True VLLM_TORCH_PROFILER_WITH_FLOPS: bool = False @@ -556,6 +557,11 @@ def get_vllm_port() -> int | None: # compilation is done in warmup phase and the compilation will be # reused in subsequent calls. "VLLM_USE_AOT_COMPILE": use_aot_compile, + # Feature flag to enable/disable bytecode in + # TorchCompileWithNoGuardsWrapper. + "VLLM_USE_BYTECODE_HOOK": lambda: bool( + int(os.environ.get("VLLM_USE_BYTECODE_HOOK", "1")) + ), # Force vllm to always load AOT compiled models from disk. Failure # to load will result in a hard error when this is enabled. # Will be ignored when VLLM_USE_AOT_COMPILE is disabled. 
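For reference, the bytecode dispatch path above relies on a plain CPython property: a function's `__code__` attribute can be reassigned to any signature-compatible code object, which is exactly what `_dispatch_to_compiled_code` does with the bytecode captured by `bytecode_hook`. Below is a minimal standalone sketch of that mechanism, assuming nothing about vLLM or Dynamo internals; the function names are illustrative only.

# Minimal sketch of the __code__ swap behind _dispatch_to_compiled_code.
# Both functions have identical signatures and no free/cell variables, the
# same precondition Dynamo guarantees for its transformed bytecode.
def forward(x):
    return x * 2

def compiled_forward(x):
    return x * 100

original_code = forward.__code__
forward.__code__ = compiled_forward.__code__   # dispatch to the swapped-in bytecode
assert forward(3) == 300
forward.__code__ = original_code               # restore the original code object
assert forward(3) == 6

The wrapper applies the same swap to `cls.forward.__code__` and restores the original code object in a `finally` block, so an exception raised inside the compiled code cannot leave the class permanently patched.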
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 0f90578671db..01490e0dfac9 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -21,7 +21,7 @@ from vllm.attention.backends.abstract import AttentionType from vllm.attention.layer import MLAAttention from vllm.attention.layers.chunked_local_attention import ChunkedLocalAttention -from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher +from vllm.compilation.wrapper import TorchCompileWithNoGuardsWrapper from vllm.config import ( ParallelConfig, VllmConfig, @@ -1895,12 +1895,14 @@ def reset_dynamo_cache(self): compiled_model = self.model.get_language_model().model else: compiled_model = self.model.model - if isinstance(compiled_model, TorchCompileWrapperWithCustomDispatcher): + if isinstance(compiled_model, TorchCompileWithNoGuardsWrapper): logger.info("Clear dynamo cache and cached dynamo bytecode.") torch._dynamo.eval_frame.remove_from_cache( - compiled_model.original_code_object + compiled_model.original_code_object() ) - compiled_model.compiled_codes.clear() + # Reset the wrapper to re-initialize. + compiled_model.compiled = False + TorchCompileWithNoGuardsWrapper.__init__(compiled_model) @torch.compile(backend="openxla", fullgraph=True, dynamic=False) def select_hidden_states(self, hidden_states, indices_do_sample): From e5c78956c0c576d8f7230c29550ff09ffff0c064 Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-redhat@users.noreply.github.com> Date: Fri, 14 Nov 2025 17:13:46 -0500 Subject: [PATCH 078/578] [Bugfix] Fix incorrect use of hidden_states for shared_experts due to do_naive_dispatch_combine (#28740) Signed-off-by: Alexander Matveev --- vllm/model_executor/layers/fused_moe/layer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index aed8245cbd83..023132acfed3 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1749,14 +1749,16 @@ def forward_impl( with sp_ctx: if do_naive_dispatch_combine: - hidden_states, router_logits = get_ep_group().dispatch( + hidden_states_combined, router_logits = get_ep_group().dispatch( hidden_states, router_logits, self.is_sequence_parallel ) # Matrix multiply. 
final_hidden_states = self.quant_method.apply( layer=self, - x=hidden_states, + x=hidden_states_combined + if do_naive_dispatch_combine + else hidden_states, router_logits=router_logits, top_k=self.top_k, renormalize=self.renormalize, From bf3ffb61e61525cce5fdec8a249f8114a0c0bfcc Mon Sep 17 00:00:00 2001 From: Benjamin Chislett Date: Fri, 14 Nov 2025 17:14:46 -0500 Subject: [PATCH 079/578] [Bugfix] Fix ChunkedLocalAttention CUDA Graph setting (#28739) Signed-off-by: Benjamin Chislett --- .../layers/chunked_local_attention.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/vllm/attention/layers/chunked_local_attention.py b/vllm/attention/layers/chunked_local_attention.py index f144e8435b6c..48fcc6fa736b 100644 --- a/vllm/attention/layers/chunked_local_attention.py +++ b/vllm/attention/layers/chunked_local_attention.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import functools -from typing import ClassVar import torch @@ -12,11 +11,16 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.v1.attention.backends.utils import ( AttentionCGSupport, + AttentionMetadataBuilder, CommonAttentionMetadata, make_local_attention_virtual_batches, subclass_attention_backend, ) -from vllm.v1.kv_cache_interface import ChunkedLocalAttentionSpec, KVCacheSpec +from vllm.v1.kv_cache_interface import ( + AttentionSpec, + ChunkedLocalAttentionSpec, + KVCacheSpec, +) from ..layer import Attention @@ -30,9 +34,18 @@ def create_chunked_local_attention_backend( prefix = f"ChunkedLocalAttention_{attention_chunk_size}_{block_size}_" underlying_builder = underlying_attn_backend.get_builder_cls() + assert issubclass(underlying_builder, AttentionMetadataBuilder) class ChunkedLocalAttentionBuilder(underlying_builder): # type: ignore - _cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.NEVER + @classmethod + def get_cudagraph_support( + cls: type["AttentionMetadataBuilder"], + vllm_config: VllmConfig, + kv_cache_spec: AttentionSpec, + ) -> AttentionCGSupport: + # Explicit override in case the underlying builder specialized this getter. + # @override omitted only because of mypy limitation due to type variable. 
+ return AttentionCGSupport.NEVER def build( self, From e0c910bb89e45f4a2a976dc3c76248bbdea854e0 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Fri, 14 Nov 2025 23:55:42 +0100 Subject: [PATCH 080/578] [Hybrid] [Kernel] Fix chunk scan kernel when BLOCK_SIZE_DSTATE > 128 (#28295) Signed-off-by: Thomas Parnell --- vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py b/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py index e5a5c9dd6f71..661c884627b0 100644 --- a/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +++ b/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py @@ -245,7 +245,7 @@ def _chunk_scan_fwd_kernel( ) if not HAS_INITSTATES and (seq_idx != seq_idx_prev): prev_states = tl.zeros( - (BLOCK_SIZE_DSTATE, BLOCK_SIZE_K), dtype=C_ptr.dtype.element_ty + (BLOCK_SIZE_K, BLOCK_SIZE_N), dtype=C_ptr.dtype.element_ty ) else: prev_states = tl.load( From ba041d980b5677a0ab6cebb3c7fe24cfe27bac66 Mon Sep 17 00:00:00 2001 From: rasmith Date: Fri, 14 Nov 2025 17:26:39 -0600 Subject: [PATCH 081/578] [Log] Save profiler results to file instead of stdout (#28144) Signed-off-by: Randall Smith Co-authored-by: Randall Smith --- vllm/v1/worker/gpu_worker.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 2b9d8bb2f25e..283e3744bcf6 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -596,14 +596,19 @@ def profile(self, is_start: bool = True): self.profiler.start() else: self.profiler.stop() - # only print profiler results on rank 0 - if ( - isinstance(self.profiler, torch.profiler.profile) - and self.local_rank == 0 - ): - print( - self.profiler.key_averages().table(sort_by="self_cuda_time_total") - ) + if isinstance(self.profiler, torch.profiler.profile): + rank = self.local_rank + profiler_dir = envs.VLLM_TORCH_PROFILER_DIR + profiler_out_file = f"{profiler_dir}/profiler_out_{rank}.txt" + sort_key = "self_cuda_time_total" + table = self.profiler.key_averages().table(sort_by=sort_key) + + with open(profiler_out_file, "w") as f: + print(table, file=f) + + # only print profiler results on rank 0 + if rank == 0: + print(table) def execute_dummy_batch(self) -> None: self.model_runner._dummy_run(1, uniform_decode=True) From 75f01b9d3c3a40e52e2fa4a2c9efc92cf45a88fc Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Fri, 14 Nov 2025 18:53:21 -0500 Subject: [PATCH 082/578] [ROCm][CI/Build] Upgrade to ROCm 7.1 and AITER main (#28753) Signed-off-by: Gregory Shtrasberg --- docker/Dockerfile.rocm_base | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile.rocm_base b/docker/Dockerfile.rocm_base index 19f7fa7e1468..df4f9b6c26e7 100644 --- a/docker/Dockerfile.rocm_base +++ b/docker/Dockerfile.rocm_base @@ -1,4 +1,4 @@ -ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete +ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.1-complete ARG TRITON_BRANCH="57c693b6" ARG TRITON_REPO="https://github.com/ROCm/triton.git" ARG PYTORCH_BRANCH="1c57644d" @@ -7,7 +7,7 @@ ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" ARG FA_BRANCH="0e60e394" ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" -ARG AITER_BRANCH="9716b1b8" +ARG AITER_BRANCH="59bd8ff2" ARG AITER_REPO="https://github.com/ROCm/aiter.git" FROM 
${BASE_IMAGE} AS base @@ -19,6 +19,9 @@ ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx11 ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} ENV AITER_ROCM_ARCH=gfx942;gfx950 +# Required for RCCL in ROCm7.1 +ENV HSA_NO_SCRATCH_RECLAIM=1 + ARG PYTHON_VERSION=3.12 RUN mkdir -p /app From 58e61e56b744da109269586fe45ecc47b10dca5f Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 14 Nov 2025 16:01:09 -0800 Subject: [PATCH 083/578] [Test] Rework e2e async scheduling tests (#28744) Signed-off-by: Nick Hill --- tests/v1/e2e/test_async_scheduling.py | 358 +++++++++++++++++++------- 1 file changed, 268 insertions(+), 90 deletions(-) diff --git a/tests/v1/e2e/test_async_scheduling.py b/tests/v1/e2e/test_async_scheduling.py index 444afd5196dd..dbe403ece051 100644 --- a/tests/v1/e2e/test_async_scheduling.py +++ b/tests/v1/e2e/test_async_scheduling.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from itertools import repeat from typing import Any import pytest @@ -8,126 +9,291 @@ from vllm import SamplingParams from vllm.logprobs import Logprob from vllm.sampling_params import StructuredOutputsParams +from vllm.v1.metrics.reader import Metric from ...conftest import VllmRunner from ...models.utils import check_outputs_equal MODEL = "Qwen/Qwen3-0.6B" +MTP_MODEL = "XiaomiMiMo/MiMo-7B-Base" -@dynamo_config.patch(cache_size_limit=16) -def test_preempt_and_async_scheduling_e2e( - sample_json_schema, monkeypatch: pytest.MonkeyPatch -): - """Test consistency of combos of async scheduling, preemption, - uni/multiproc executor, and various sampling parameters - including structured outputs.""" +first_prompt = ( + "The following numbers of the sequence " + + ", ".join(str(i) for i in range(10)) + + " are:" +) +example_prompts = [first_prompt, "In one word, the capital of France is "] + [ + f"Tell me about the number {i}: " for i in range(32) +] - first_prompt = ( - "The following numbers of the sequence " - + ", ".join(str(i) for i in range(10)) - + " are:" - ) - example_prompts = [first_prompt, "In one word, the capital of France is "] + [ - f"Tell me about the number {i}: " for i in range(32) - ] +default_params = dict( + temperature=0.0, # greedy + max_tokens=20, +) - sampling_param_tests: list[dict[str, Any]] = [ + +def test_without_spec_decoding( + sample_json_schema, + monkeypatch: pytest.MonkeyPatch, +): + """Test consistency of combos of async scheduling, preemption, + uni/multiproc executor, prefill chunking.""" + struct_outputs = StructuredOutputsParams(json=sample_json_schema) + test_sampling_params: list[dict[str, Any]] = [ dict(), # dict(min_tokens=20), dict(presence_penalty=-1.0), dict(bad_words=["the", " the"]), dict(logprobs=2), dict(logprobs=2, presence_penalty=-1.0), - dict(structured_outputs=StructuredOutputsParams(json=sample_json_schema)), + dict(structured_outputs=struct_outputs), dict( - structured_outputs=StructuredOutputsParams(json=sample_json_schema), + structured_outputs=struct_outputs, logprobs=2, presence_penalty=-1.0, ), ] - default_params = dict( - temperature=0.0, # greedy - max_tokens=20, + # test_preemption, executor, async_scheduling, + # spec_config, test_prefill_chunking + test_configs = [ + (False, "mp", False, None, False), + (True, "mp", False, None, True), + (False, "mp", True, None, False), + (False, "uni", True, None, False), + (True, "mp", True, None, False), + (True, "uni", True, None, False), + (False, "mp", True, None, True), + # Async scheduling + preemption + 
chunked prefill needs to be fixed (WIP) + # (True, "mp", True, None, True), + # (True, "uni", True, None, True), + ] + + run_tests( + monkeypatch, + MODEL, + test_configs, + test_sampling_params, ) + +@pytest.mark.skip("MTP model too big to run in fp32 in CI") +def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch): + """Test consistency and acceptance rates with some different combos of + preemption, executor, async scheduling, prefill chunking, + spec decoding model length. + """ + + spec_config = { + "method": "mtp", + "num_speculative_tokens": 2, + } + spec_config_short = spec_config | {"max_model_len": 50} + + # test_preemption, executor, async_scheduling, + # spec_config, test_prefill_chunking + test_configs = [ + (False, "mp", False, None, False), + (False, "mp", False, spec_config, False), + (True, "mp", False, spec_config, True), + (True, "uni", False, spec_config_short, True), + (False, "mp", True, spec_config, False), + (True, "mp", True, spec_config, False), + (False, "mp", True, spec_config_short, True), + (True, "uni", True, spec_config, False), + (True, "uni", True, spec_config_short, False), + # Async scheduling + preemption + chunked prefill needs to be fixed (WIP) + # (True, "mp", True, spec_config, True), + # (True, "uni", True, spec_config_short, True), + ] + + run_tests( + monkeypatch, + MTP_MODEL, + test_configs, + [{}], + ) + + +@dynamo_config.patch(cache_size_limit=16) +def run_tests( + monkeypatch: pytest.MonkeyPatch, + model: str, + test_configs: list[tuple], + test_sampling_params: list[dict[str, Any]], +): + """Test consistency of combos of async scheduling, preemption, + uni/multiproc executor with spec decoding.""" + with monkeypatch.context() as m: + # avoid precision errors m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION") # m.setenv("VLLM_BATCH_INVARIANT", "1") + outputs: list[tuple[str, list, list]] = [] + for n, ( + test_preemption, + executor, + async_scheduling, + spec_config, + test_prefill_chunking, + ) in enumerate(test_configs, 1): + test_str = f"{n}/{len(test_configs)}" + test_results = run_test( + model, + test_str, + test_sampling_params, + test_preemption, + executor, + async_scheduling, + spec_config, + test_prefill_chunking=test_prefill_chunking, + ) + outputs.append(test_results) + + baseline_config, baseline_tests, _ = outputs[0] + _, _, baseline_acceptances = next( + (o for o in outputs if o[2] is not None), (None, None, None) + ) - outputs: list[tuple[str, list]] = [] - for test_preemption in [False, True]: - for executor in ["mp", "uni"]: - for async_scheduling in [False, True]: - cache_arg: dict[str, Any] = ( - dict(num_gpu_blocks_override=32) - if test_preemption - else dict(gpu_memory_utilization=0.7) - ) - test_config = ( - f"executor={executor}, preemption={test_preemption}," - f" async_sched={async_scheduling}" - ) - print("-" * 80) - print(f"---- TESTING: {test_config}") - print("-" * 80) - with VllmRunner( - MODEL, - max_model_len=512, - enforce_eager=True, - async_scheduling=async_scheduling, - distributed_executor_backend=executor, - dtype="float32", # avoid precision errors - **cache_arg, - ) as vllm_model: - results = [] - for override_params in sampling_param_tests: - print(f"----------- RUNNING PARAMS: {override_params}") - results.append( - vllm_model.generate( - example_prompts, - sampling_params=SamplingParams( - **default_params, **override_params - ), - return_logprobs=True, - ) - ) - - if not outputs: - # First check that the different parameter configs - # actually result in different output. 
- for (other_test_outs, other_test_logprobs), params in zip( - results[1:], sampling_param_tests[1:] - ): - with pytest.raises(AssertionError): - check_outputs_equal( - outputs_0_lst=results[0][0], - outputs_1_lst=other_test_outs, - name_0=f"baseline params={params}", - name_1=f"other params={params}", - ) - assert _all_logprobs_match( - results[0][1], other_test_logprobs - ) - - outputs.append((test_config, results)) - - baseline_config, baseline_tests = outputs[0] - - for test_config, test_outputs in outputs[1:]: - for (base_outs, base_logprobs), (test_outs, test_logprobs), params in zip( - baseline_tests, test_outputs, sampling_param_tests + print(f"BASELINE: config=[{baseline_config}], accept_rates={baseline_acceptances}") + + failure = None + for test_config, test_outputs, test_acceptance_rates in outputs[1:]: + for (base_outs, base_logprobs), base_acceptance_rate, ( + test_outs, + test_logprobs, + ), test_acceptance_rate, params in zip( + baseline_tests, + baseline_acceptances or repeat(None), + test_outputs, + test_acceptance_rates or repeat(None), + test_sampling_params, ): - check_outputs_equal( - outputs_0_lst=base_outs, - outputs_1_lst=test_outs, - name_0=f"baseline=[{baseline_config}], params={params}", - name_1=f"config=[{test_config}], params={params}", + try: + check_outputs_equal( + outputs_0_lst=base_outs, + outputs_1_lst=test_outs, + name_0=f"baseline=[{baseline_config}], params={params}", + name_1=f"config=[{test_config}], params={params}", + ) + assert _all_logprobs_match(base_logprobs, test_logprobs) + + if ( + base_acceptance_rate is not None + and test_acceptance_rate is not None + ): + if "spec_mml=None" in test_config: + # because the acceptance rate can vary, we use a looser + # tolerance here. + assert ( + pytest.approx(test_acceptance_rate, rel=5e-2) + == base_acceptance_rate + ) + else: + # Currently the reported acceptance rate is expected to be + # lower when we skip drafting altogether. 
+ assert test_acceptance_rate > 0.05 + print( + f"PASSED: config=[{test_config}], params={params}" + f" accept_rate={test_acceptance_rate}" + ) + except AssertionError as e: + print( + f"FAILED: config=[{test_config}], params={params}" + f" accept_rate={test_acceptance_rate}" + ) + if failure is None: + failure = e + + if failure is not None: + raise failure + + +def run_test( + model: str, + test_str: str, + sampling_param_tests: list[dict[str, Any]], + test_preemption: bool, + executor: str, + async_scheduling: bool, + spec_config: dict[str, Any] | None, + test_prefill_chunking: bool, +): + spec_decoding = spec_config is not None + cache_arg: dict[str, Any] = ( + dict(num_gpu_blocks_override=32) + if test_preemption + else dict(gpu_memory_utilization=0.9) + ) + spec_mml = (spec_config or {}).get("max_model_len") + test_config = ( + f"executor={executor}, preemption={test_preemption}, " + f"async_sched={async_scheduling}, " + f"chunk_prefill={test_prefill_chunking}, " + f"spec_decoding={spec_decoding}, spec_mml={spec_mml}" + ) + print("-" * 80) + print(f"---- TESTING {test_str}: {test_config}") + print("-" * 80) + with VllmRunner( + model, + max_model_len=512, + enable_chunked_prefill=test_prefill_chunking, + max_num_batched_tokens=48 if test_prefill_chunking else None, + # enforce_eager=True, + async_scheduling=async_scheduling, + distributed_executor_backend=executor, + dtype="float32", # avoid precision errors + speculative_config=spec_config, + disable_log_stats=False, + **cache_arg, + ) as vllm_model: + results = [] + acceptance_rates: list[float] | None = [] if spec_decoding else None + for override_params in sampling_param_tests: + metrics_before = vllm_model.llm.get_metrics() + print(f"----------- RUNNING PARAMS: {override_params}") + results.append( + vllm_model.generate( + example_prompts, + sampling_params=SamplingParams( + **default_params, + **override_params, + ), + return_logprobs=True, + ) ) - assert _all_logprobs_match(base_logprobs, test_logprobs) + metrics_after = vllm_model.llm.get_metrics() + if acceptance_rates is not None: + acceptance_rate = _get_acceptance_rate(metrics_before, metrics_after) + acceptance_rates.append(acceptance_rate) + print(f"ACCEPTANCE RATE {acceptance_rate}") + + if test_preemption: + preemptions = _get_count( + metrics_before, + metrics_after, + "vllm:num_preemptions", + ) + assert preemptions > 0, "preemption test had no preemptions" + + if len(results) > 1: + # First check that the different parameter configs + # actually result in different output. 
+ for (other_test_outs, other_test_logprobs), params in zip( + results[1:], sampling_param_tests[1:] + ): + with pytest.raises(AssertionError): + check_outputs_equal( + outputs_0_lst=results[0][0], + outputs_1_lst=other_test_outs, + name_0=f"baseline params={params}", + name_1=f"other params={params}", + ) + assert _all_logprobs_match(results[0][1], other_test_logprobs) - print(f"PASSED: config=[{test_config}], params={params}") + return test_config, results, acceptance_rates def _all_logprobs_match(req_a, req_b) -> bool: @@ -149,3 +315,15 @@ def _logprobs_match(lps_a: dict[int, Logprob], lps_b: dict[int, Logprob]) -> boo and a.logprob == pytest.approx(b.logprob, rel=1e-3, abs=1e-6) for a, b in ((lps_a[x], lps_b[x]) for x in lps_a) ) + + +def _get_acceptance_rate(before: list[Metric], after: list[Metric]) -> float: + draft = _get_count(before, after, "vllm:spec_decode_num_draft_tokens") + accept = _get_count(before, after, "vllm:spec_decode_num_accepted_tokens") + return accept / draft if draft > 0 else 0.0 + + +def _get_count(before: list[Metric], after: list[Metric], name: str) -> int: + before_val = next(m.value for m in before if m.name == name) + after_val = next(m.value for m in after if m.name == name) + return after_val - before_val From 186352b2703652141df75bc2c012a784706e8572 Mon Sep 17 00:00:00 2001 From: Jialin Ouyang Date: Fri, 14 Nov 2025 16:04:04 -0800 Subject: [PATCH 084/578] [Core] Performance: Use list[np.ndarray] instead of list[list[int]] for output tokens for GC optimization (#26368) Signed-off-by: Jialin Ouyang --- tests/v1/core/test_async_scheduler.py | 3 +- tests/v1/core/test_scheduler.py | 76 +++++++++++++++----------- tests/v1/kv_connector/unit/utils.py | 3 +- tests/v1/spec_decode/test_eagle.py | 5 +- tests/v1/spec_decode/test_ngram.py | 18 +++--- vllm/v1/core/sched/scheduler.py | 4 +- vllm/v1/outputs.py | 2 +- vllm/v1/sample/rejection_sampler.py | 8 +-- vllm/v1/spec_decode/eagle.py | 7 +-- vllm/v1/spec_decode/ngram_proposer.py | 6 +- vllm/v1/spec_decode/suffix_decoding.py | 10 ++-- vllm/v1/worker/gpu_model_runner.py | 36 +++++++----- 12 files changed, 102 insertions(+), 76 deletions(-) diff --git a/tests/v1/core/test_async_scheduler.py b/tests/v1/core/test_async_scheduler.py index e0645ed43015..1d80ee987591 100644 --- a/tests/v1/core/test_async_scheduler.py +++ b/tests/v1/core/test_async_scheduler.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections import deque +import numpy as np import pytest from vllm.v1.core.sched.output import SchedulerOutput @@ -21,7 +22,7 @@ def _make_model_runner_output( return ModelRunnerOutput( req_ids=req_ids, req_id_to_index={req_id: i for i, req_id in enumerate(req_ids)}, - sampled_token_ids=[[i] for i in range(len(req_ids))], + sampled_token_ids=[np.array([i]) for i in range(len(req_ids))], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 04e738293cd7..6d95c29ec1ab 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -3,6 +3,7 @@ import dataclasses from unittest.mock import Mock +import numpy as np import pytest import torch @@ -169,7 +170,7 @@ def test_schedule_partial_requests(): req_id_to_index=req_to_index, # Only the first request has a sampled token id because # the rest requests are still being prefilled. 
- sampled_token_ids=[[0], [], []], + sampled_token_ids=[np.array([0]), np.array([]), np.array([])], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -216,7 +217,7 @@ def test_no_mm_input_chunking(): model_runner_output = ModelRunnerOutput( req_ids=[request.request_id for request in requests], req_id_to_index=req_to_index, - sampled_token_ids=[[] for _ in range(len(requests))], + sampled_token_ids=[np.array([]) for _ in range(len(requests))], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -276,7 +277,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool): model_runner_output = ModelRunnerOutput( req_ids=[request.request_id for request in requests], req_id_to_index=req_to_index, - sampled_token_ids=[[] for _ in range(len(requests))], + sampled_token_ids=[np.array([]) for _ in range(len(requests))], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -300,7 +301,8 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool): model_runner_output = ModelRunnerOutput( req_ids=[request.request_id for request in requests], req_id_to_index=req_to_index, - sampled_token_ids=[[0], [0]] + [[] for _ in range(len(requests) - 2)], + sampled_token_ids=[np.array([0]), np.array([0])] + + [np.array([]) for _ in range(len(requests) - 2)], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -347,8 +349,8 @@ def test_stop_via_update_from_output(): req_ids=[req.request_id for req in requests], req_id_to_index={req.request_id: i for i, req in enumerate(requests)}, sampled_token_ids=[ - [EOS_TOKEN_ID], - [10, 11], + np.array([EOS_TOKEN_ID]), + np.array([10, 11]), ], # First request hits EOS, second continues logprobs=None, prompt_logprobs_dict={}, @@ -392,7 +394,10 @@ def test_stop_via_update_from_output(): model_output = ModelRunnerOutput( req_ids=[req.request_id for req in requests], req_id_to_index={req.request_id: i for i, req in enumerate(requests)}, - sampled_token_ids=[[10, 42, 12], [13, 14]], # First request hits stop token + sampled_token_ids=[ + np.array([10, 42, 12]), + np.array([13, 14]), + ], # First request hits stop token logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -436,7 +441,10 @@ def test_stop_via_update_from_output(): model_output = ModelRunnerOutput( req_ids=[req.request_id for req in requests], req_id_to_index={req.request_id: i for i, req in enumerate(requests)}, - sampled_token_ids=[[10, 11, 12], [13]], # First request exceeds max_tokens + sampled_token_ids=[ + np.array([10, 11, 12]), + np.array([13]), + ], # First request exceeds max_tokens logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -475,7 +483,7 @@ def test_stop_via_update_from_output(): model_output = ModelRunnerOutput( req_ids=[requests[0].request_id], req_id_to_index={requests[0].request_id: 0}, - sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]], + sampled_token_ids=[np.array([EOS_TOKEN_ID, 10, 11])], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -616,7 +624,7 @@ def test_schedule_concurrent_batches( model_runner_output = ModelRunnerOutput( req_ids=[requests[0].request_id], req_id_to_index={requests[0].request_id: 0}, - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -633,7 +641,7 @@ def test_schedule_concurrent_batches( model_runner_output = ModelRunnerOutput( req_ids=[requests[1].request_id], req_id_to_index={requests[1].request_id: 0}, - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], logprobs=None, 
prompt_logprobs_dict={}, pooler_output=[], @@ -670,7 +678,7 @@ def test_preempt_during_execution(): model_runner_output0 = ModelRunnerOutput( req_ids=[requests[0].request_id], req_id_to_index={requests[0].request_id: 0}, - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -687,7 +695,7 @@ def test_preempt_during_execution(): model_runner_output1 = ModelRunnerOutput( req_ids=[requests[1].request_id], req_id_to_index={requests[1].request_id: 0}, - sampled_token_ids=[[42]], + sampled_token_ids=[np.array([42])], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -704,14 +712,18 @@ def test_preempt_during_execution(): @pytest.mark.parametrize( "spec_tokens,output_tokens,expected", [ - ([[1, 2, 3]], [[1, 2, 3, 4]], (1, 3, 3, [1, 1, 1])), # perfect match - ([[1, 2, 3]], [[1, 5]], (1, 3, 1, [1, 0, 0])), # early mismatch - ([[1, 2], [3]], [[1, 2, 5], [3, 4]], (2, 3, 3, [2, 1])), # multiple sequences - ([[1]], [[1, 2]], (1, 1, 1, [1])), # single token sequence - ([[]], [[5]], (0, 0, 0, [0])), # empty sequence + ([[1, 2, 3]], [np.array([1, 2, 3, 4])], (1, 3, 3, [1, 1, 1])), # perfect match + ([[1, 2, 3]], [np.array([1, 5])], (1, 3, 1, [1, 0, 0])), # early mismatch + ( + [[1, 2], [3]], + [np.array([1, 2, 5]), np.array([3, 4])], + (2, 3, 3, [2, 1]), + ), # multiple sequences + ([[1]], [np.array([1, 2])], (1, 1, 1, [1])), # single token sequence + ([[]], [np.array([5])], (0, 0, 0, [0])), # empty sequence ( [[1, 2, 3], [4, 5, 6]], - [[1, 2, 7], [4, 8]], + [np.array([1, 2, 7]), np.array([4, 8])], (2, 6, 3, [2, 1, 0]), ), # multiple mismatches ], @@ -745,7 +757,7 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected): model_runner_output = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, - sampled_token_ids=[[0] for _ in range(len(requests))], + sampled_token_ids=[np.array([0]) for _ in range(len(requests))], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -972,7 +984,7 @@ def test_kv_connector_basic(is_async: bool): MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, - sampled_token_ids=[[1000]] * len(req_ids), + sampled_token_ids=[np.array([1000])] * len(req_ids), logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1025,7 +1037,7 @@ def test_kv_connector_basic(is_async: bool): MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, - sampled_token_ids=[[1000]] * len(req_ids), + sampled_token_ids=[np.array([1000])] * len(req_ids), logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1088,7 +1100,7 @@ def test_external_prefix_cache_metrics(): MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=[r.request_id for r in requests], req_id_to_index={r.request_id: i for i, r in enumerate(requests)}, - sampled_token_ids=[[1000]] * NUM_REQUESTS, + sampled_token_ids=[np.array([1000])] * NUM_REQUESTS, logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1154,7 +1166,7 @@ def test_kv_connector_unable_to_allocate(use_ec_connector, ec_role): MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, - sampled_token_ids=[[1000]] * len(req_ids), + sampled_token_ids=[np.array([1000])] * len(req_ids), logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1239,7 +1251,7 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role): MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, - sampled_token_ids=[[1000]] * 
len(req_ids), + sampled_token_ids=[np.array([1000])] * len(req_ids), logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1332,7 +1344,7 @@ def make_output(scheduler: Scheduler): return ModelRunnerOutput( req_ids=[req.request_id for req in scheduler.running], req_id_to_index={req.request_id: i for i, req in enumerate(scheduler.running)}, - sampled_token_ids=[[1000]] * len(scheduler.running), + sampled_token_ids=[np.array([1000])] * len(scheduler.running), logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1749,7 +1761,7 @@ def test_priority_scheduling_preemption(): req_id_to_index={ req.request_id: i for i, req in enumerate(low_priority_requests) }, - sampled_token_ids=[[100] for _ in low_priority_requests], + sampled_token_ids=[np.array([100]) for _ in low_priority_requests], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1818,7 +1830,7 @@ def test_priority_scheduling_no_preemption_when_space_available(): req_id_to_index={ req.request_id: i for i, req in enumerate(low_priority_requests) }, - sampled_token_ids=[[100] for _ in low_priority_requests], + sampled_token_ids=[np.array([100]) for _ in low_priority_requests], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -2064,7 +2076,7 @@ def test_priority_scheduling_heap_property(): model_output = ModelRunnerOutput( req_ids=[req.req_id], req_id_to_index={req.req_id: 0}, - sampled_token_ids=[[100]], + sampled_token_ids=[np.array([100])], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -2150,7 +2162,7 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv( model_output = ModelRunnerOutput( req_ids=[request_low.request_id], req_id_to_index={request_low.request_id: 0}, - sampled_token_ids=[[100]], + sampled_token_ids=[np.array([100])], # spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, @@ -2181,7 +2193,7 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv( model_output = ModelRunnerOutput( req_ids=[req.request_id for req in requests], req_id_to_index={req.request_id: i for i, req in enumerate(requests)}, - sampled_token_ids=[[100] for _ in requests], + sampled_token_ids=[np.array([100]) for _ in requests], # spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, @@ -2207,7 +2219,7 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv( model_output = ModelRunnerOutput( req_ids=[req.request_id for req in requests], req_id_to_index={req.request_id: i for i, req in enumerate(requests)}, - sampled_token_ids=[[], [100]], + sampled_token_ids=[np.array([]), np.array([100])], # spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index f35f91bb3adf..c248104d5b5e 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -7,6 +7,7 @@ from itertools import chain, count from typing import Any +import numpy as np import torch from vllm import SamplingParams @@ -228,7 +229,7 @@ def create_model_runner_output( # Make sampled tokens. 
sampled_token = EOS_TOKEN_ID if use_eos else token_id - sampled_token_ids = [[sampled_token] for _ in req_ids] + sampled_token_ids = [np.array([sampled_token]) for _ in req_ids] kv_connector_output = ( None diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index 89d0ec769ac0..421da5241555 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -3,6 +3,7 @@ from unittest import mock +import numpy as np import pytest import torch @@ -112,7 +113,9 @@ def test_prepare_next_token_ids(): sampled_token_ids_tensor = torch.tensor( sampled_token_ids, dtype=torch.int32, device=device ) - sampled_token_ids_cpu = [[i for i in seq if i != -1] for seq in sampled_token_ids] + sampled_token_ids_cpu = [ + np.array([i for i in seq if i != -1]) for seq in sampled_token_ids + ] expected_next_token_ids_cpu = [1, 4, 30, 40] expected_next_token_ids_tensor = torch.tensor( diff --git a/tests/v1/spec_decode/test_ngram.py b/tests/v1/spec_decode/test_ngram.py index 692c39282c37..563bc1d957f4 100644 --- a/tests/v1/spec_decode/test_ngram.py +++ b/tests/v1/spec_decode/test_ngram.py @@ -77,7 +77,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # No match. token_ids_cpu = np.array([[1, 2, 3, 4, 5]]) result = get_ngram_proposer(min_n=2, max_n=2, k=2).propose( - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -88,7 +88,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # No match for 4-gram. token_ids_cpu = np.array([[1, 2, 3, 4, 1, 2, 3]]) result = get_ngram_proposer(min_n=4, max_n=4, k=2).propose( - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -99,7 +99,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # No match for 4-gram but match for 3-gram. token_ids_cpu = np.array([[1, 2, 3, 4, 1, 2, 3]]) result = get_ngram_proposer(min_n=3, max_n=4, k=2).propose( - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -111,7 +111,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # In this case, the proposer should return the 4-gram match. token_ids_cpu = np.array([[2, 3, 4, 5, 1, 2, 3, 4, 1, 2, 3, 4]]) result = get_ngram_proposer(min_n=3, max_n=4, k=2).propose( - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -122,7 +122,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # Match for 2-gram and 3-gram, but not 4-gram. token_ids_cpu = np.array([[3, 4, 5, 2, 3, 4, 1, 2, 3, 4]]) result = get_ngram_proposer(min_n=2, max_n=4, k=2).propose( - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -133,7 +133,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # Multiple 3-gram matched, but always pick the first one. 
token_ids_cpu = np.array([[1, 2, 3, 100, 1, 2, 3, 200, 1, 2, 3, 300, 1, 2, 3]]) result = get_ngram_proposer(min_n=3, max_n=3, k=2).propose( - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -144,7 +144,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # check empty input token_ids_cpu = np.array([[]]) result = get_ngram_proposer(min_n=2, max_n=2, k=2).propose( - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -157,7 +157,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # second request has 3 tokens and no match. Padded with -1 for max len 5 token_ids_cpu = np.array([[1, 2, 3, 1, 2], [4, 5, 6, -1, -1]]) result = get_ngram_proposer(min_n=2, max_n=2, k=2).propose( - sampled_token_ids=[[0], [1]], + sampled_token_ids=[np.array([0]), np.array([1])], req_ids=["0", "1"], num_tokens_no_spec=np.array([5, 3]), token_ids_cpu=token_ids_cpu, @@ -181,7 +181,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: input_2[:3] = [4, 5, 6] token_ids_cpu = np.array([input_1, input_2]) result = ngram_proposer.propose( - sampled_token_ids=[[0], [1]], + sampled_token_ids=[np.array([0]), np.array([1])], req_ids=["0", "1"], num_tokens_no_spec=np.array([len(input_1), 3]), token_ids_cpu=token_ids_cpu, diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index ba7ad0c09173..c640c40a455d 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -1010,8 +1010,8 @@ def update_from_output( continue req_index = model_runner_output.req_id_to_index[req_id] - generated_token_ids = ( - sampled_token_ids[req_index] if sampled_token_ids else [] + generated_token_ids: list[int] = ( + sampled_token_ids[req_index].tolist() if sampled_token_ids else [] ) scheduled_spec_token_ids = ( diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index e32d5bb608b1..60ee9671e497 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -158,7 +158,7 @@ class ModelRunnerOutput: # num_generated_tokens is the number of tokens # generated in the current step. It can be different for # each request due to speculative/jump decoding. - sampled_token_ids: list[list[int]] + sampled_token_ids: list[np.ndarray] # [num_reqs, max_num_logprobs + 1] # [num_reqs, max_num_logprobs + 1] diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py index 926305d25f56..f31a0cddda9a 100644 --- a/vllm/v1/sample/rejection_sampler.py +++ b/vllm/v1/sample/rejection_sampler.py @@ -3,6 +3,7 @@ from dataclasses import replace +import numpy as np import torch import torch.nn as nn @@ -204,7 +205,7 @@ def _get_logprobs_tensors( def parse_output( output_token_ids: torch.Tensor, vocab_size: int, - ) -> list[list[int]]: + ) -> list[np.ndarray]: """Parse the output of the rejection sampler. 
Args: output_token_ids: The sampled token IDs in shape @@ -220,10 +221,7 @@ def parse_output( valid_mask = (output_token_ids_np != PLACEHOLDER_TOKEN_ID) & ( output_token_ids_np < vocab_size ) - outputs = [ - row[valid_mask[i]].tolist() for i, row in enumerate(output_token_ids_np) - ] - return outputs + return [row[valid_mask[i]] for i, row in enumerate(output_token_ids_np)] def apply_logits_processors( self, diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index beef5203e039..f3b34544f8d9 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -484,7 +484,7 @@ def propose( def prepare_next_token_ids_cpu( self, - sampled_token_ids: list[list[int]], + sampled_token_ids: list[np.ndarray], requests: dict[str, CachedRequestState], gpu_input_batch: InputBatch, num_scheduled_tokens: dict[str, int], @@ -499,7 +499,7 @@ def prepare_next_token_ids_cpu( req_ids = gpu_input_batch.req_ids next_token_ids: list[int] = [] for i, token_ids in enumerate(sampled_token_ids): - if token_ids: + if token_ids.shape[0] > 0: # Common case. next_token_id = token_ids[-1] else: @@ -510,10 +510,9 @@ def prepare_next_token_ids_cpu( seq_len = req_state.num_computed_tokens + num_scheduled_tokens[req_id] next_token_id = req_state.get_token_id(seq_len) next_token_ids.append(next_token_id) - next_token_ids = torch.tensor( + return torch.tensor( next_token_ids, dtype=torch.int32, device=self.input_ids.device ) - return next_token_ids def prepare_next_token_ids_padded( self, diff --git a/vllm/v1/spec_decode/ngram_proposer.py b/vllm/v1/spec_decode/ngram_proposer.py index e2f83cb24aa9..378937dba988 100644 --- a/vllm/v1/spec_decode/ngram_proposer.py +++ b/vllm/v1/spec_decode/ngram_proposer.py @@ -54,7 +54,7 @@ def __init__(self, vllm_config: VllmConfig): # Trigger Numba JIT compilation for N-gram proposer. # This usually takes less than 1 second. self.propose( - [[]] * 1024, + [np.array([])] * 1024, [""] * 1024, np.zeros(1024, dtype=np.int32), np.zeros((1024, self.max_model_len), dtype=np.int32), @@ -131,7 +131,7 @@ def batch_propose( def propose( self, - sampled_token_ids: list[list[int]], + sampled_token_ids: list[np.ndarray], req_ids: list[str], num_tokens_no_spec: np.ndarray, token_ids_cpu: np.ndarray, @@ -140,7 +140,7 @@ def propose( # find which requests need ngram proposals valid_ngram_requests = [] for i, sampled_ids in enumerate(sampled_token_ids): - num_sampled_ids = len(sampled_ids) + num_sampled_ids = sampled_ids.shape[0] if not num_sampled_ids: # Skip speculative decoding. continue diff --git a/vllm/v1/spec_decode/suffix_decoding.py b/vllm/v1/spec_decode/suffix_decoding.py index 049e335db325..d76e0ffe778d 100644 --- a/vllm/v1/spec_decode/suffix_decoding.py +++ b/vllm/v1/spec_decode/suffix_decoding.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import numpy as np + from vllm.config import VllmConfig from vllm.v1.worker.gpu_input_batch import InputBatch @@ -32,16 +34,16 @@ def __init__(self, vllm_config: VllmConfig): def propose( self, input_batch: InputBatch, - sampled_token_ids: list[list[int]], + sampled_token_ids: list[np.ndarray], ) -> list[list[int]]: """ Propose speculative tokens for each request in the input batch. Suffix Decoding will speculate a dynamic number of tokens for each request every decoding step, so each entry in the returned list may have different lengths. 
""" - draft_token_ids: list[list[int]] = [] + draft_token_ids: list[np.ndarray] = [] for i, sampled_ids in enumerate(sampled_token_ids): - if not sampled_ids: + if sampled_ids.shape[0] == 0: # Skip speculative decoding for partial prefills. draft_token_ids.append([]) continue @@ -70,7 +72,7 @@ def propose( self.suffix_cache.start_request(req_id, prompt_token_ids) # Append the newly sampled ids to the suffix cache for this request. - self.suffix_cache.add_active_response(req_id, sampled_ids) + self.suffix_cache.add_active_response(req_id, sampled_ids.tolist()) # Suffix decoding only uses the most recent tokens up to max_tree_depth, so # we extract the pattern from the end of the input. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 9b3e5b668aab..d0d6164180e6 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -216,9 +216,11 @@ def get_output(self) -> ModelRunnerOutput: del self._logprobs_tensors del self._sampled_token_ids - valid_sampled_token_ids = self.sampled_token_ids_cpu.tolist() + valid_sampled_token_ids: list[np.ndarray] = [ + row for row in self.sampled_token_ids_cpu.numpy() + ] for i in self._invalid_req_indices: - valid_sampled_token_ids[i].clear() + valid_sampled_token_ids[i] = np.array([]) output = self._model_runner_output output.sampled_token_ids = valid_sampled_token_ids @@ -2339,7 +2341,7 @@ def _bookkeeping_sync( ) -> tuple[ dict[str, int], LogprobsLists | None, - list[list[int]], + list[np.ndarray], dict[str, LogprobsTensors | None], list[str], dict[str, int], @@ -2365,6 +2367,7 @@ def _bookkeeping_sync( num_sampled_tokens = sampler_output.sampled_token_ids.shape[0] sampled_token_ids = sampler_output.sampled_token_ids invalid_req_indices = [] + valid_sampled_token_ids: list[np.ndarray] if not self.use_async_scheduling: # Get the valid generated tokens. max_gen_len = sampled_token_ids.shape[-1] @@ -2379,7 +2382,7 @@ def _bookkeeping_sync( ) # Mask out the sampled tokens that should not be sampled. 
for i in discard_sampled_tokens_req_indices: - valid_sampled_token_ids[int(i)].clear() + valid_sampled_token_ids[int(i)] = np.array([]) else: valid_sampled_token_ids = [] invalid_req_indices = discard_sampled_tokens_req_indices.tolist() @@ -2407,19 +2410,24 @@ def _bookkeeping_sync( [0] if spec_decode_metadata and logprobs_tensors else None ) for req_idx in range(num_sampled_tokens): + sampled_ids: np.ndarray | None if self.use_async_scheduling: - sampled_ids = [-1] if req_idx not in invalid_req_indices_set else None + sampled_ids = ( + np.array([-1]) if req_idx not in invalid_req_indices_set else None + ) else: sampled_ids = valid_sampled_token_ids[req_idx] - num_sampled_ids: int = len(sampled_ids) if sampled_ids else 0 + num_sampled_ids: int = ( + sampled_ids.shape[0] if sampled_ids is not None else 0 + ) if cu_num_accepted_tokens is not None: cu_num_accepted_tokens.append( cu_num_accepted_tokens[-1] + num_sampled_ids ) - if not sampled_ids: + if sampled_ids is None or num_sampled_ids == 0: continue start_idx = self.input_batch.num_tokens_no_spec[req_idx] @@ -2761,7 +2769,9 @@ def sample_tokens( with record_function_or_nullcontext("gpu_model_runner: sample"): sampler_output = self._sample(logits, spec_decode_metadata) - def propose_draft_token_ids(sampled_token_ids): + def propose_draft_token_ids( + sampled_token_ids: torch.Tensor | list[np.ndarray], + ) -> None: assert spec_decode_common_attn_metadata is not None with record_function_or_nullcontext("gpu_model_runner: draft"): self._draft_token_ids = self.propose_draft_token_ids( @@ -2883,14 +2893,14 @@ def take_draft_token_ids(self) -> DraftTokenIds | None: def propose_draft_token_ids( self, scheduler_output: "SchedulerOutput", - sampled_token_ids: torch.Tensor | list[list[int]], + sampled_token_ids: torch.Tensor | list[np.ndarray], sampling_metadata: SamplingMetadata, hidden_states: torch.Tensor, sample_hidden_states: torch.Tensor, aux_hidden_states: list[torch.Tensor] | None, spec_decode_metadata: SpecDecodeMetadata | None, common_attn_metadata: CommonAttentionMetadata, - ) -> list[list[int]] | torch.Tensor: + ) -> torch.Tensor | list[list[int]]: num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if self.speculative_config.method == "ngram": assert isinstance(sampled_token_ids, list) @@ -2922,7 +2932,7 @@ def propose_draft_token_ids( for num_draft, tokens in zip( spec_decode_metadata.num_draft_tokens, sampled_token_ids ): - indices.append(offset + len(tokens) - 1) + indices.append(offset + tokens.shape[0] - 1) offset += num_draft + 1 indices = torch.tensor(indices, device=self.device) hidden_states = sample_hidden_states[indices] @@ -4862,7 +4872,7 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: return kv_cache_spec - def _to_list(self, sampled_token_ids: torch.Tensor) -> list[list[int]]: + def _to_list(self, sampled_token_ids: torch.Tensor) -> list[np.ndarray]: # This is a short term mitigation for issue mentioned in # https://github.com/vllm-project/vllm/issues/22754. 
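A rough sketch of the pinned-buffer copy pattern used by the `_to_list` hunk continuing below; the tensor name and shape are invented, and the point is only that the recorded event waits for this one copy rather than forcing a device-wide stream sync.

import torch

if torch.cuda.is_available():
    sampled_gpu = torch.randint(0, 1000, (8, 4), dtype=torch.int64, device="cuda")
    # Stage into pinned host memory so the device-to-host copy can run async.
    pinned = torch.empty(sampled_gpu.shape, dtype=sampled_gpu.dtype,
                         device="cpu", pin_memory=True)
    pinned.copy_(sampled_gpu, non_blocking=True)
    done = torch.cuda.Event()
    done.record()
    done.synchronize()                      # wait for this copy only
    rows = [row for row in pinned.numpy()]  # or pinned.tolist() for plain lists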
# `tolist` would trigger a cuda wise stream sync, which @@ -4875,4 +4885,4 @@ def _to_list(self, sampled_token_ids: torch.Tensor) -> list[list[int]]: pinned.copy_(sampled_token_ids, non_blocking=True) self.transfer_event.record() self.transfer_event.synchronize() - return pinned.tolist() + return [row for row in pinned.numpy()] From 9fc81ec765aa0daa6f704023c0f902a0da653b72 Mon Sep 17 00:00:00 2001 From: QiliangCui Date: Fri, 14 Nov 2025 16:58:32 -0800 Subject: [PATCH 085/578] [TPU] Fix import error in tpu launch (#28758) Signed-off-by: Qiliang Cui --- vllm/platforms/tpu.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 4ab037fdb77e..c1218801bc07 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -9,20 +9,25 @@ from vllm.inputs import ProcessorInputs, PromptType from vllm.logger import init_logger -from vllm.sampling_params import SamplingParams, SamplingType from .interface import Platform, PlatformEnum if TYPE_CHECKING: + from typing import TypeAlias + from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config import VllmConfig from vllm.config.cache import BlockSize from vllm.pooling_params import PoolingParams + from vllm.sampling_params import SamplingParams + + ParamsType: TypeAlias = SamplingParams | PoolingParams else: BlockSize = None VllmConfig = None PoolingParams = None AttentionBackendEnum = None + ParamsType = None logger = init_logger(__name__) @@ -203,10 +208,12 @@ def get_device_communicator_cls(cls) -> str: def validate_request( cls, prompt: PromptType, - params: SamplingParams | PoolingParams, + params: ParamsType, processed_inputs: ProcessorInputs, ) -> None: """Raises if this request is unsupported on this platform""" + from vllm.sampling_params import SamplingParams, SamplingType + if ( isinstance(params, SamplingParams) and params.sampling_type == SamplingType.RANDOM_SEED From f05d474c8a08659cc1610a85de7e7a7095494a52 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Sat, 15 Nov 2025 03:45:11 +0000 Subject: [PATCH 086/578] [Model][Qwen3VL] Use `mm_position` to compute mrope positions (#28730) Signed-off-by: Lukas Geiger Co-authored-by: Cyrus Leung --- vllm/model_executor/models/qwen3_vl.py | 87 +++++++++----------------- 1 file changed, 31 insertions(+), 56 deletions(-) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index f1c020ab5813..fa6b71bf9268 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -24,7 +24,7 @@ # limitations under the License. 
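The Qwen3VL hunks below rebuild the mrope positions from each feature's `mm_position` offset; as a toy illustration of the 3-row (t, h, w) layout being produced, with made-up sizes:

import numpy as np

text_len, grid_h, grid_w = 4, 2, 3   # invented: 4 text tokens, then one 2x3 vision grid
llm_pos_ids_list = []
st_idx = 0
# Text span: all three rows advance together.
llm_pos_ids_list.append(np.broadcast_to(np.arange(text_len), (3, text_len)) + st_idx)
# Vision grid: rows enumerate (t, h, w) indices, shifted past the text span.
st_idx = llm_pos_ids_list[-1].max() + 1
grid_indices = np.indices((1, grid_h, grid_w)).reshape(3, -1)
llm_pos_ids_list.append(grid_indices + st_idx)
positions = np.concatenate(llm_pos_ids_list, axis=1)
print(positions.shape)   # (3, 10)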
"""Inference-only Qwen3VL model compatible with HuggingFace weights.""" -from collections.abc import Callable, Iterable, Mapping, Sequence +from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence from functools import partial from itertools import islice from typing import Any @@ -1412,72 +1412,47 @@ def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: ) return mm_input_by_modality + def iter_mm_grid_hw( + self, input_tokens: list[int], mm_features: list[MultiModalFeatureSpec] + ) -> Iterator[tuple[int, int, int]]: + video_token_id = self.config.video_token_id + spatial_merge_size = self.config.vision_config.spatial_merge_size + for mm_feature in sorted(mm_features, key=lambda f: f.mm_position.offset): + offset = mm_feature.mm_position.offset + if mm_feature.modality == "image": + t, h, w = mm_feature.data["image_grid_thw"].data.tolist() + assert t == 1, f"Image must have 1 frame, got {t}" + yield offset, h // spatial_merge_size, w // spatial_merge_size + elif mm_feature.modality == "video": + t, h, w = mm_feature.data["video_grid_thw"].data.tolist() + llm_grid_h = h // spatial_merge_size + llm_grid_w = w // spatial_merge_size + for _ in range(t): + offset = input_tokens.index(video_token_id, offset) + yield offset, llm_grid_h, llm_grid_w + offset += llm_grid_h * llm_grid_w + else: + raise ValueError(f"Unsupported modality: {mm_feature.modality}") + def get_mrope_input_positions( self, input_tokens: list[int], mm_features: list[MultiModalFeatureSpec], ) -> tuple[torch.Tensor, int]: - kwargs = MultiModalFeatureSpec.gather_kwargs( - mm_features, - {"image_grid_thw", "video_grid_thw"}, - ) - image_grid_thw = [item.tolist() for item in kwargs.get("image_grid_thw", [])] - video_grid_thw = [item.tolist() for item in kwargs.get("video_grid_thw", [])] - - video_grid_thw = [[1, h, w] for t, h, w in video_grid_thw for _ in range(t)] - - hf_config = self.config - image_token_id = hf_config.image_token_id - video_token_id = hf_config.video_token_id - vision_start_token_id = hf_config.vision_start_token_id - spatial_merge_size = hf_config.vision_config.spatial_merge_size - - input_tokens_array = np.array(input_tokens) - vision_start_mask = input_tokens_array == vision_start_token_id - vision_tokens = input_tokens_array[vision_start_mask.nonzero()[0] + 1] - image_nums = np.count_nonzero(vision_tokens == image_token_id) - video_nums = np.count_nonzero(vision_tokens == video_token_id) - llm_pos_ids_list: list = [] - + llm_pos_ids_list = [] st = 0 - remain_images, remain_videos = image_nums, video_nums - - image_index, video_index = 0, 0 - for _ in range(image_nums + video_nums): - if image_token_id in input_tokens and remain_images > 0: - ed_image = input_tokens.index(image_token_id, st) - else: - ed_image = len(input_tokens) + 1 - if video_token_id in input_tokens and remain_videos > 0: - ed_video = input_tokens.index(video_token_id, st) - else: - ed_video = len(input_tokens) + 1 - if ed_image < ed_video: - t, h, w = image_grid_thw[image_index] - image_index += 1 - remain_images -= 1 - ed = ed_image - else: - t, h, w = video_grid_thw[video_index] - video_index += 1 - remain_videos -= 1 - ed = ed_video - - llm_grid_t, llm_grid_h, llm_grid_w = ( - t, - h // spatial_merge_size, - w // spatial_merge_size, - ) - text_len = ed - st - + for offset, llm_grid_h, llm_grid_w in self.iter_mm_grid_hw( + input_tokens, mm_features + ): + text_len = offset - st st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 llm_pos_ids_list.append( 
np.broadcast_to(np.arange(text_len), (3, text_len)) + st_idx ) - grid_indices = np.indices((llm_grid_t, llm_grid_h, llm_grid_w)) - llm_pos_ids_list.append(grid_indices.reshape(3, -1) + text_len + st_idx) - st = ed + llm_grid_t * llm_grid_h * llm_grid_w + grid_indices = np.indices((1, llm_grid_h, llm_grid_w)).reshape(3, -1) + llm_pos_ids_list.append(grid_indices + text_len + st_idx) + st = offset + llm_grid_h * llm_grid_w if st < len(input_tokens): st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 From edfe49818959b1a1a0b7e8ef7ffcdc39d9903ec6 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 14 Nov 2025 22:51:05 -0500 Subject: [PATCH 087/578] [Bugfix] Build hadacore kernels on >SM90 (#28748) Signed-off-by: mgoin --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index dcc44be87e55..3a37040edbf1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -861,7 +861,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() # Hadacore kernels - cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(HADACORE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}") if(HADACORE_ARCHS) set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu") set_gencode_flags_for_srcs( From ac86bff8cb53939117a6a460af1a6c3fea829a56 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 14 Nov 2025 20:24:00 -0800 Subject: [PATCH 088/578] =?UTF-8?q?Revert=20"[Core]=20Performance:=20Use?= =?UTF-8?q?=20list[np.ndarray]=20instead=20of=20list[list=E2=80=A6=20(#287?= =?UTF-8?q?73)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/v1/core/test_async_scheduler.py | 3 +- tests/v1/core/test_scheduler.py | 76 +++++++++++--------------- tests/v1/kv_connector/unit/utils.py | 3 +- tests/v1/spec_decode/test_eagle.py | 5 +- tests/v1/spec_decode/test_ngram.py | 18 +++--- vllm/v1/core/sched/scheduler.py | 4 +- vllm/v1/outputs.py | 2 +- vllm/v1/sample/rejection_sampler.py | 8 ++- vllm/v1/spec_decode/eagle.py | 7 ++- vllm/v1/spec_decode/ngram_proposer.py | 6 +- vllm/v1/spec_decode/suffix_decoding.py | 10 ++-- vllm/v1/worker/gpu_model_runner.py | 36 +++++------- 12 files changed, 76 insertions(+), 102 deletions(-) diff --git a/tests/v1/core/test_async_scheduler.py b/tests/v1/core/test_async_scheduler.py index 1d80ee987591..e0645ed43015 100644 --- a/tests/v1/core/test_async_scheduler.py +++ b/tests/v1/core/test_async_scheduler.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections import deque -import numpy as np import pytest from vllm.v1.core.sched.output import SchedulerOutput @@ -22,7 +21,7 @@ def _make_model_runner_output( return ModelRunnerOutput( req_ids=req_ids, req_id_to_index={req_id: i for i, req_id in enumerate(req_ids)}, - sampled_token_ids=[np.array([i]) for i in range(len(req_ids))], + sampled_token_ids=[[i] for i in range(len(req_ids))], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 6d95c29ec1ab..04e738293cd7 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -3,7 +3,6 @@ import dataclasses from unittest.mock import Mock -import numpy as np import pytest import torch @@ -170,7 +169,7 @@ def test_schedule_partial_requests(): req_id_to_index=req_to_index, # Only the first request has a sampled token id because # the rest requests are still being prefilled. 
- sampled_token_ids=[np.array([0]), np.array([]), np.array([])], + sampled_token_ids=[[0], [], []], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -217,7 +216,7 @@ def test_no_mm_input_chunking(): model_runner_output = ModelRunnerOutput( req_ids=[request.request_id for request in requests], req_id_to_index=req_to_index, - sampled_token_ids=[np.array([]) for _ in range(len(requests))], + sampled_token_ids=[[] for _ in range(len(requests))], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -277,7 +276,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool): model_runner_output = ModelRunnerOutput( req_ids=[request.request_id for request in requests], req_id_to_index=req_to_index, - sampled_token_ids=[np.array([]) for _ in range(len(requests))], + sampled_token_ids=[[] for _ in range(len(requests))], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -301,8 +300,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool): model_runner_output = ModelRunnerOutput( req_ids=[request.request_id for request in requests], req_id_to_index=req_to_index, - sampled_token_ids=[np.array([0]), np.array([0])] - + [np.array([]) for _ in range(len(requests) - 2)], + sampled_token_ids=[[0], [0]] + [[] for _ in range(len(requests) - 2)], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -349,8 +347,8 @@ def test_stop_via_update_from_output(): req_ids=[req.request_id for req in requests], req_id_to_index={req.request_id: i for i, req in enumerate(requests)}, sampled_token_ids=[ - np.array([EOS_TOKEN_ID]), - np.array([10, 11]), + [EOS_TOKEN_ID], + [10, 11], ], # First request hits EOS, second continues logprobs=None, prompt_logprobs_dict={}, @@ -394,10 +392,7 @@ def test_stop_via_update_from_output(): model_output = ModelRunnerOutput( req_ids=[req.request_id for req in requests], req_id_to_index={req.request_id: i for i, req in enumerate(requests)}, - sampled_token_ids=[ - np.array([10, 42, 12]), - np.array([13, 14]), - ], # First request hits stop token + sampled_token_ids=[[10, 42, 12], [13, 14]], # First request hits stop token logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -441,10 +436,7 @@ def test_stop_via_update_from_output(): model_output = ModelRunnerOutput( req_ids=[req.request_id for req in requests], req_id_to_index={req.request_id: i for i, req in enumerate(requests)}, - sampled_token_ids=[ - np.array([10, 11, 12]), - np.array([13]), - ], # First request exceeds max_tokens + sampled_token_ids=[[10, 11, 12], [13]], # First request exceeds max_tokens logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -483,7 +475,7 @@ def test_stop_via_update_from_output(): model_output = ModelRunnerOutput( req_ids=[requests[0].request_id], req_id_to_index={requests[0].request_id: 0}, - sampled_token_ids=[np.array([EOS_TOKEN_ID, 10, 11])], + sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -624,7 +616,7 @@ def test_schedule_concurrent_batches( model_runner_output = ModelRunnerOutput( req_ids=[requests[0].request_id], req_id_to_index={requests[0].request_id: 0}, - sampled_token_ids=[np.array([0])], + sampled_token_ids=[[0]], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -641,7 +633,7 @@ def test_schedule_concurrent_batches( model_runner_output = ModelRunnerOutput( req_ids=[requests[1].request_id], req_id_to_index={requests[1].request_id: 0}, - sampled_token_ids=[np.array([0])], + sampled_token_ids=[[0]], logprobs=None, 
prompt_logprobs_dict={}, pooler_output=[], @@ -678,7 +670,7 @@ def test_preempt_during_execution(): model_runner_output0 = ModelRunnerOutput( req_ids=[requests[0].request_id], req_id_to_index={requests[0].request_id: 0}, - sampled_token_ids=[np.array([0])], + sampled_token_ids=[[0]], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -695,7 +687,7 @@ def test_preempt_during_execution(): model_runner_output1 = ModelRunnerOutput( req_ids=[requests[1].request_id], req_id_to_index={requests[1].request_id: 0}, - sampled_token_ids=[np.array([42])], + sampled_token_ids=[[42]], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -712,18 +704,14 @@ def test_preempt_during_execution(): @pytest.mark.parametrize( "spec_tokens,output_tokens,expected", [ - ([[1, 2, 3]], [np.array([1, 2, 3, 4])], (1, 3, 3, [1, 1, 1])), # perfect match - ([[1, 2, 3]], [np.array([1, 5])], (1, 3, 1, [1, 0, 0])), # early mismatch - ( - [[1, 2], [3]], - [np.array([1, 2, 5]), np.array([3, 4])], - (2, 3, 3, [2, 1]), - ), # multiple sequences - ([[1]], [np.array([1, 2])], (1, 1, 1, [1])), # single token sequence - ([[]], [np.array([5])], (0, 0, 0, [0])), # empty sequence + ([[1, 2, 3]], [[1, 2, 3, 4]], (1, 3, 3, [1, 1, 1])), # perfect match + ([[1, 2, 3]], [[1, 5]], (1, 3, 1, [1, 0, 0])), # early mismatch + ([[1, 2], [3]], [[1, 2, 5], [3, 4]], (2, 3, 3, [2, 1])), # multiple sequences + ([[1]], [[1, 2]], (1, 1, 1, [1])), # single token sequence + ([[]], [[5]], (0, 0, 0, [0])), # empty sequence ( [[1, 2, 3], [4, 5, 6]], - [np.array([1, 2, 7]), np.array([4, 8])], + [[1, 2, 7], [4, 8]], (2, 6, 3, [2, 1, 0]), ), # multiple mismatches ], @@ -757,7 +745,7 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected): model_runner_output = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, - sampled_token_ids=[np.array([0]) for _ in range(len(requests))], + sampled_token_ids=[[0] for _ in range(len(requests))], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -984,7 +972,7 @@ def test_kv_connector_basic(is_async: bool): MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, - sampled_token_ids=[np.array([1000])] * len(req_ids), + sampled_token_ids=[[1000]] * len(req_ids), logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1037,7 +1025,7 @@ def test_kv_connector_basic(is_async: bool): MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, - sampled_token_ids=[np.array([1000])] * len(req_ids), + sampled_token_ids=[[1000]] * len(req_ids), logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1100,7 +1088,7 @@ def test_external_prefix_cache_metrics(): MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=[r.request_id for r in requests], req_id_to_index={r.request_id: i for i, r in enumerate(requests)}, - sampled_token_ids=[np.array([1000])] * NUM_REQUESTS, + sampled_token_ids=[[1000]] * NUM_REQUESTS, logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1166,7 +1154,7 @@ def test_kv_connector_unable_to_allocate(use_ec_connector, ec_role): MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, - sampled_token_ids=[np.array([1000])] * len(req_ids), + sampled_token_ids=[[1000]] * len(req_ids), logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1251,7 +1239,7 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role): MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, - 
sampled_token_ids=[np.array([1000])] * len(req_ids), + sampled_token_ids=[[1000]] * len(req_ids), logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1344,7 +1332,7 @@ def make_output(scheduler: Scheduler): return ModelRunnerOutput( req_ids=[req.request_id for req in scheduler.running], req_id_to_index={req.request_id: i for i, req in enumerate(scheduler.running)}, - sampled_token_ids=[np.array([1000])] * len(scheduler.running), + sampled_token_ids=[[1000]] * len(scheduler.running), logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1761,7 +1749,7 @@ def test_priority_scheduling_preemption(): req_id_to_index={ req.request_id: i for i, req in enumerate(low_priority_requests) }, - sampled_token_ids=[np.array([100]) for _ in low_priority_requests], + sampled_token_ids=[[100] for _ in low_priority_requests], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1830,7 +1818,7 @@ def test_priority_scheduling_no_preemption_when_space_available(): req_id_to_index={ req.request_id: i for i, req in enumerate(low_priority_requests) }, - sampled_token_ids=[np.array([100]) for _ in low_priority_requests], + sampled_token_ids=[[100] for _ in low_priority_requests], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -2076,7 +2064,7 @@ def test_priority_scheduling_heap_property(): model_output = ModelRunnerOutput( req_ids=[req.req_id], req_id_to_index={req.req_id: 0}, - sampled_token_ids=[np.array([100])], + sampled_token_ids=[[100]], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -2162,7 +2150,7 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv( model_output = ModelRunnerOutput( req_ids=[request_low.request_id], req_id_to_index={request_low.request_id: 0}, - sampled_token_ids=[np.array([100])], + sampled_token_ids=[[100]], # spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, @@ -2193,7 +2181,7 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv( model_output = ModelRunnerOutput( req_ids=[req.request_id for req in requests], req_id_to_index={req.request_id: i for i, req in enumerate(requests)}, - sampled_token_ids=[np.array([100]) for _ in requests], + sampled_token_ids=[[100] for _ in requests], # spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, @@ -2219,7 +2207,7 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv( model_output = ModelRunnerOutput( req_ids=[req.request_id for req in requests], req_id_to_index={req.request_id: i for i, req in enumerate(requests)}, - sampled_token_ids=[np.array([]), np.array([100])], + sampled_token_ids=[[], [100]], # spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index c248104d5b5e..f35f91bb3adf 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -7,7 +7,6 @@ from itertools import chain, count from typing import Any -import numpy as np import torch from vllm import SamplingParams @@ -229,7 +228,7 @@ def create_model_runner_output( # Make sampled tokens. 
sampled_token = EOS_TOKEN_ID if use_eos else token_id - sampled_token_ids = [np.array([sampled_token]) for _ in req_ids] + sampled_token_ids = [[sampled_token] for _ in req_ids] kv_connector_output = ( None diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index 421da5241555..89d0ec769ac0 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -3,7 +3,6 @@ from unittest import mock -import numpy as np import pytest import torch @@ -113,9 +112,7 @@ def test_prepare_next_token_ids(): sampled_token_ids_tensor = torch.tensor( sampled_token_ids, dtype=torch.int32, device=device ) - sampled_token_ids_cpu = [ - np.array([i for i in seq if i != -1]) for seq in sampled_token_ids - ] + sampled_token_ids_cpu = [[i for i in seq if i != -1] for seq in sampled_token_ids] expected_next_token_ids_cpu = [1, 4, 30, 40] expected_next_token_ids_tensor = torch.tensor( diff --git a/tests/v1/spec_decode/test_ngram.py b/tests/v1/spec_decode/test_ngram.py index 563bc1d957f4..692c39282c37 100644 --- a/tests/v1/spec_decode/test_ngram.py +++ b/tests/v1/spec_decode/test_ngram.py @@ -77,7 +77,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # No match. token_ids_cpu = np.array([[1, 2, 3, 4, 5]]) result = get_ngram_proposer(min_n=2, max_n=2, k=2).propose( - sampled_token_ids=[np.array([0])], + sampled_token_ids=[[0]], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -88,7 +88,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # No match for 4-gram. token_ids_cpu = np.array([[1, 2, 3, 4, 1, 2, 3]]) result = get_ngram_proposer(min_n=4, max_n=4, k=2).propose( - sampled_token_ids=[np.array([0])], + sampled_token_ids=[[0]], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -99,7 +99,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # No match for 4-gram but match for 3-gram. token_ids_cpu = np.array([[1, 2, 3, 4, 1, 2, 3]]) result = get_ngram_proposer(min_n=3, max_n=4, k=2).propose( - sampled_token_ids=[np.array([0])], + sampled_token_ids=[[0]], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -111,7 +111,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # In this case, the proposer should return the 4-gram match. token_ids_cpu = np.array([[2, 3, 4, 5, 1, 2, 3, 4, 1, 2, 3, 4]]) result = get_ngram_proposer(min_n=3, max_n=4, k=2).propose( - sampled_token_ids=[np.array([0])], + sampled_token_ids=[[0]], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -122,7 +122,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # Match for 2-gram and 3-gram, but not 4-gram. token_ids_cpu = np.array([[3, 4, 5, 2, 3, 4, 1, 2, 3, 4]]) result = get_ngram_proposer(min_n=2, max_n=4, k=2).propose( - sampled_token_ids=[np.array([0])], + sampled_token_ids=[[0]], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -133,7 +133,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # Multiple 3-gram matched, but always pick the first one. 
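These ngram tests describe the proposal rule: try the longest suffix n-gram first, take the first earlier match, and propose the k tokens that followed it. A simplified stand-alone sketch of that rule (not the Numba implementation used by vLLM):

import numpy as np

def propose_ngram(history: np.ndarray, min_n: int, max_n: int, k: int) -> np.ndarray:
    for n in range(max_n, min_n - 1, -1):          # longest n-gram first
        if len(history) < n + 1:
            continue
        suffix = history[-n:]
        for start in range(len(history) - n):      # first (leftmost) match wins
            if np.array_equal(history[start:start + n], suffix):
                return history[start + n:start + n + k]
    return np.array([], dtype=history.dtype)

print(propose_ngram(np.array([1, 2, 3, 100, 1, 2, 3, 200, 1, 2, 3]), 3, 3, 2))
# -> [100   1]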
token_ids_cpu = np.array([[1, 2, 3, 100, 1, 2, 3, 200, 1, 2, 3, 300, 1, 2, 3]]) result = get_ngram_proposer(min_n=3, max_n=3, k=2).propose( - sampled_token_ids=[np.array([0])], + sampled_token_ids=[[0]], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -144,7 +144,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # check empty input token_ids_cpu = np.array([[]]) result = get_ngram_proposer(min_n=2, max_n=2, k=2).propose( - sampled_token_ids=[np.array([0])], + sampled_token_ids=[[0]], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -157,7 +157,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # second request has 3 tokens and no match. Padded with -1 for max len 5 token_ids_cpu = np.array([[1, 2, 3, 1, 2], [4, 5, 6, -1, -1]]) result = get_ngram_proposer(min_n=2, max_n=2, k=2).propose( - sampled_token_ids=[np.array([0]), np.array([1])], + sampled_token_ids=[[0], [1]], req_ids=["0", "1"], num_tokens_no_spec=np.array([5, 3]), token_ids_cpu=token_ids_cpu, @@ -181,7 +181,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: input_2[:3] = [4, 5, 6] token_ids_cpu = np.array([input_1, input_2]) result = ngram_proposer.propose( - sampled_token_ids=[np.array([0]), np.array([1])], + sampled_token_ids=[[0], [1]], req_ids=["0", "1"], num_tokens_no_spec=np.array([len(input_1), 3]), token_ids_cpu=token_ids_cpu, diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index c640c40a455d..ba7ad0c09173 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -1010,8 +1010,8 @@ def update_from_output( continue req_index = model_runner_output.req_id_to_index[req_id] - generated_token_ids: list[int] = ( - sampled_token_ids[req_index].tolist() if sampled_token_ids else [] + generated_token_ids = ( + sampled_token_ids[req_index] if sampled_token_ids else [] ) scheduled_spec_token_ids = ( diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index 60ee9671e497..e32d5bb608b1 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -158,7 +158,7 @@ class ModelRunnerOutput: # num_generated_tokens is the number of tokens # generated in the current step. It can be different for # each request due to speculative/jump decoding. - sampled_token_ids: list[np.ndarray] + sampled_token_ids: list[list[int]] # [num_reqs, max_num_logprobs + 1] # [num_reqs, max_num_logprobs + 1] diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py index f31a0cddda9a..926305d25f56 100644 --- a/vllm/v1/sample/rejection_sampler.py +++ b/vllm/v1/sample/rejection_sampler.py @@ -3,7 +3,6 @@ from dataclasses import replace -import numpy as np import torch import torch.nn as nn @@ -205,7 +204,7 @@ def _get_logprobs_tensors( def parse_output( output_token_ids: torch.Tensor, vocab_size: int, - ) -> list[np.ndarray]: + ) -> list[list[int]]: """Parse the output of the rejection sampler. 
Args: output_token_ids: The sampled token IDs in shape @@ -221,7 +220,10 @@ def parse_output( valid_mask = (output_token_ids_np != PLACEHOLDER_TOKEN_ID) & ( output_token_ids_np < vocab_size ) - return [row[valid_mask[i]] for i, row in enumerate(output_token_ids_np)] + outputs = [ + row[valid_mask[i]].tolist() for i, row in enumerate(output_token_ids_np) + ] + return outputs def apply_logits_processors( self, diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index f3b34544f8d9..beef5203e039 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -484,7 +484,7 @@ def propose( def prepare_next_token_ids_cpu( self, - sampled_token_ids: list[np.ndarray], + sampled_token_ids: list[list[int]], requests: dict[str, CachedRequestState], gpu_input_batch: InputBatch, num_scheduled_tokens: dict[str, int], @@ -499,7 +499,7 @@ def prepare_next_token_ids_cpu( req_ids = gpu_input_batch.req_ids next_token_ids: list[int] = [] for i, token_ids in enumerate(sampled_token_ids): - if token_ids.shape[0] > 0: + if token_ids: # Common case. next_token_id = token_ids[-1] else: @@ -510,9 +510,10 @@ def prepare_next_token_ids_cpu( seq_len = req_state.num_computed_tokens + num_scheduled_tokens[req_id] next_token_id = req_state.get_token_id(seq_len) next_token_ids.append(next_token_id) - return torch.tensor( + next_token_ids = torch.tensor( next_token_ids, dtype=torch.int32, device=self.input_ids.device ) + return next_token_ids def prepare_next_token_ids_padded( self, diff --git a/vllm/v1/spec_decode/ngram_proposer.py b/vllm/v1/spec_decode/ngram_proposer.py index 378937dba988..e2f83cb24aa9 100644 --- a/vllm/v1/spec_decode/ngram_proposer.py +++ b/vllm/v1/spec_decode/ngram_proposer.py @@ -54,7 +54,7 @@ def __init__(self, vllm_config: VllmConfig): # Trigger Numba JIT compilation for N-gram proposer. # This usually takes less than 1 second. self.propose( - [np.array([])] * 1024, + [[]] * 1024, [""] * 1024, np.zeros(1024, dtype=np.int32), np.zeros((1024, self.max_model_len), dtype=np.int32), @@ -131,7 +131,7 @@ def batch_propose( def propose( self, - sampled_token_ids: list[np.ndarray], + sampled_token_ids: list[list[int]], req_ids: list[str], num_tokens_no_spec: np.ndarray, token_ids_cpu: np.ndarray, @@ -140,7 +140,7 @@ def propose( # find which requests need ngram proposals valid_ngram_requests = [] for i, sampled_ids in enumerate(sampled_token_ids): - num_sampled_ids = sampled_ids.shape[0] + num_sampled_ids = len(sampled_ids) if not num_sampled_ids: # Skip speculative decoding. continue diff --git a/vllm/v1/spec_decode/suffix_decoding.py b/vllm/v1/spec_decode/suffix_decoding.py index d76e0ffe778d..049e335db325 100644 --- a/vllm/v1/spec_decode/suffix_decoding.py +++ b/vllm/v1/spec_decode/suffix_decoding.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import numpy as np - from vllm.config import VllmConfig from vllm.v1.worker.gpu_input_batch import InputBatch @@ -34,16 +32,16 @@ def __init__(self, vllm_config: VllmConfig): def propose( self, input_batch: InputBatch, - sampled_token_ids: list[np.ndarray], + sampled_token_ids: list[list[int]], ) -> list[list[int]]: """ Propose speculative tokens for each request in the input batch. Suffix Decoding will speculate a dynamic number of tokens for each request every decoding step, so each entry in the returned list may have different lengths. 
""" - draft_token_ids: list[np.ndarray] = [] + draft_token_ids: list[list[int]] = [] for i, sampled_ids in enumerate(sampled_token_ids): - if sampled_ids.shape[0] == 0: + if not sampled_ids: # Skip speculative decoding for partial prefills. draft_token_ids.append([]) continue @@ -72,7 +70,7 @@ def propose( self.suffix_cache.start_request(req_id, prompt_token_ids) # Append the newly sampled ids to the suffix cache for this request. - self.suffix_cache.add_active_response(req_id, sampled_ids.tolist()) + self.suffix_cache.add_active_response(req_id, sampled_ids) # Suffix decoding only uses the most recent tokens up to max_tree_depth, so # we extract the pattern from the end of the input. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d0d6164180e6..9b3e5b668aab 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -216,11 +216,9 @@ def get_output(self) -> ModelRunnerOutput: del self._logprobs_tensors del self._sampled_token_ids - valid_sampled_token_ids: list[np.ndarray] = [ - row for row in self.sampled_token_ids_cpu.numpy() - ] + valid_sampled_token_ids = self.sampled_token_ids_cpu.tolist() for i in self._invalid_req_indices: - valid_sampled_token_ids[i] = np.array([]) + valid_sampled_token_ids[i].clear() output = self._model_runner_output output.sampled_token_ids = valid_sampled_token_ids @@ -2341,7 +2339,7 @@ def _bookkeeping_sync( ) -> tuple[ dict[str, int], LogprobsLists | None, - list[np.ndarray], + list[list[int]], dict[str, LogprobsTensors | None], list[str], dict[str, int], @@ -2367,7 +2365,6 @@ def _bookkeeping_sync( num_sampled_tokens = sampler_output.sampled_token_ids.shape[0] sampled_token_ids = sampler_output.sampled_token_ids invalid_req_indices = [] - valid_sampled_token_ids: list[np.ndarray] if not self.use_async_scheduling: # Get the valid generated tokens. max_gen_len = sampled_token_ids.shape[-1] @@ -2382,7 +2379,7 @@ def _bookkeeping_sync( ) # Mask out the sampled tokens that should not be sampled. 
for i in discard_sampled_tokens_req_indices: - valid_sampled_token_ids[int(i)] = np.array([]) + valid_sampled_token_ids[int(i)].clear() else: valid_sampled_token_ids = [] invalid_req_indices = discard_sampled_tokens_req_indices.tolist() @@ -2410,24 +2407,19 @@ def _bookkeeping_sync( [0] if spec_decode_metadata and logprobs_tensors else None ) for req_idx in range(num_sampled_tokens): - sampled_ids: np.ndarray | None if self.use_async_scheduling: - sampled_ids = ( - np.array([-1]) if req_idx not in invalid_req_indices_set else None - ) + sampled_ids = [-1] if req_idx not in invalid_req_indices_set else None else: sampled_ids = valid_sampled_token_ids[req_idx] - num_sampled_ids: int = ( - sampled_ids.shape[0] if sampled_ids is not None else 0 - ) + num_sampled_ids: int = len(sampled_ids) if sampled_ids else 0 if cu_num_accepted_tokens is not None: cu_num_accepted_tokens.append( cu_num_accepted_tokens[-1] + num_sampled_ids ) - if sampled_ids is None or num_sampled_ids == 0: + if not sampled_ids: continue start_idx = self.input_batch.num_tokens_no_spec[req_idx] @@ -2769,9 +2761,7 @@ def sample_tokens( with record_function_or_nullcontext("gpu_model_runner: sample"): sampler_output = self._sample(logits, spec_decode_metadata) - def propose_draft_token_ids( - sampled_token_ids: torch.Tensor | list[np.ndarray], - ) -> None: + def propose_draft_token_ids(sampled_token_ids): assert spec_decode_common_attn_metadata is not None with record_function_or_nullcontext("gpu_model_runner: draft"): self._draft_token_ids = self.propose_draft_token_ids( @@ -2893,14 +2883,14 @@ def take_draft_token_ids(self) -> DraftTokenIds | None: def propose_draft_token_ids( self, scheduler_output: "SchedulerOutput", - sampled_token_ids: torch.Tensor | list[np.ndarray], + sampled_token_ids: torch.Tensor | list[list[int]], sampling_metadata: SamplingMetadata, hidden_states: torch.Tensor, sample_hidden_states: torch.Tensor, aux_hidden_states: list[torch.Tensor] | None, spec_decode_metadata: SpecDecodeMetadata | None, common_attn_metadata: CommonAttentionMetadata, - ) -> torch.Tensor | list[list[int]]: + ) -> list[list[int]] | torch.Tensor: num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if self.speculative_config.method == "ngram": assert isinstance(sampled_token_ids, list) @@ -2932,7 +2922,7 @@ def propose_draft_token_ids( for num_draft, tokens in zip( spec_decode_metadata.num_draft_tokens, sampled_token_ids ): - indices.append(offset + tokens.shape[0] - 1) + indices.append(offset + len(tokens) - 1) offset += num_draft + 1 indices = torch.tensor(indices, device=self.device) hidden_states = sample_hidden_states[indices] @@ -4872,7 +4862,7 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: return kv_cache_spec - def _to_list(self, sampled_token_ids: torch.Tensor) -> list[np.ndarray]: + def _to_list(self, sampled_token_ids: torch.Tensor) -> list[list[int]]: # This is a short term mitigation for issue mentioned in # https://github.com/vllm-project/vllm/issues/22754. 
# `tolist` would trigger a cuda wise stream sync, which @@ -4885,4 +4875,4 @@ def _to_list(self, sampled_token_ids: torch.Tensor) -> list[np.ndarray]: pinned.copy_(sampled_token_ids, non_blocking=True) self.transfer_event.record() self.transfer_event.synchronize() - return [row for row in pinned.numpy()] + return pinned.tolist() From 363aaeef0ff8511fd1466d41a2e027b22b28f39b Mon Sep 17 00:00:00 2001 From: Mohammad Othman <48595863+OthmanMohammad@users.noreply.github.com> Date: Sat, 15 Nov 2025 06:31:36 +0200 Subject: [PATCH 089/578] Fix IntermediateTensors initialization and add type hints (#28743) Signed-off-by: Mohammad Othman Co-authored-by: Mohammad Othman --- vllm/sequence.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/sequence.py b/vllm/sequence.py index 6bcc94ad5c62..6d20ca9aac22 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -60,12 +60,17 @@ class IntermediateTensors: tensors: dict[str, torch.Tensor] kv_connector_output: KVConnectorOutput | None - def __init__(self, tensors): + def __init__( + self, + tensors: dict[str, torch.Tensor], + kv_connector_output: KVConnectorOutput | None = None, + ) -> None: # manually define this function, so that # Dynamo knows `IntermediateTensors()` comes from this file. # Otherwise, dataclass will generate this function by evaluating # a string, and we will lose the information about the source file. self.tensors = tensors + self.kv_connector_output = kv_connector_output def __getitem__(self, key: str | slice): if isinstance(key, str): From c9e665852abbd42d7404a4f6dad7d47478ca95f8 Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Fri, 14 Nov 2025 23:51:32 -0600 Subject: [PATCH 090/578] [NIXL] heterogeneous block_size support (#26759) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Chendi Xue Signed-off-by: Chendi.Xue Co-authored-by: Nicolò Lucchesi --- .../nixl_integration/run_accuracy_test.sh | 4 + .../kv_connector/unit/test_nixl_connector.py | 3 + .../kv_connector/v1/nixl_connector.py | 309 ++++++++++++++---- 3 files changed, 257 insertions(+), 59 deletions(-) diff --git a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh index a9817313cf02..ebc8575e5b39 100755 --- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh @@ -49,6 +49,8 @@ NUM_DECODE_INSTANCES=${NUM_DECODE_INSTANCES:-1} # Default to 1 PREFILLER_TP_SIZE=${PREFILLER_TP_SIZE:-1} DECODER_TP_SIZE=${DECODER_TP_SIZE:-1} GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.2} +PREFILL_BLOCK_SIZE=${PREFILL_BLOCK_SIZE:-16} +DECODE_BLOCK_SIZE=${DECODE_BLOCK_SIZE:-16} # Find the git repository root directory GIT_ROOT=$(git rev-parse --show-toplevel) @@ -136,6 +138,7 @@ run_tests_for_model() { vllm serve $model_name \ --port $PORT \ --enforce-eager \ + --block-size ${PREFILL_BLOCK_SIZE} \ --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \ --tensor-parallel-size $PREFILLER_TP_SIZE \ --kv-transfer-config '$KV_CONFIG'" @@ -177,6 +180,7 @@ run_tests_for_model() { vllm serve $model_name \ --port $PORT \ --enforce-eager \ + --block-size ${DECODE_BLOCK_SIZE} \ --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \ --kv-transfer-config '$KV_CONFIG'" diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index 8e421717fea3..b7d7a10057b8 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ 
b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -407,6 +407,7 @@ def _nixl_handshake( # `self.kv_cache_layout` is only forced to HND when vllm engine # is started. We mock HND here. kv_cache_layout="HND", + block_size=self.block_size, ), remote_tp_size=remote_tp_size, ) @@ -652,6 +653,7 @@ def test_handshake_fails_on_kv_cache_layout_mismatch(self, dist_init): block_lens=worker.block_len_per_layer, attn_backend_name=worker.backend_name, kv_cache_layout=mismatched_layout, + block_size=worker.block_size, ) with pytest.raises(RuntimeError): @@ -706,6 +708,7 @@ def test_handshake_succeed_on_kv_cache_layout_mismatch_with_experimental( block_lens=[i * 2 for i in worker.block_len_per_layer], attn_backend_name=worker.backend_name, kv_cache_layout="HND", + block_size=worker.block_size, ) # We don't check layout for homogeneous TP and MLA for now, as the diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 3d4547c51453..a70c98b63713 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -108,6 +108,7 @@ class NixlAgentMetadata(KVConnectorHandshakeMetadata): block_lens: list[int] attn_backend_name: str kv_cache_layout: str + block_size: int @dataclass @@ -709,6 +710,9 @@ def split_k_and_v(self) -> bool: self.is_mla or self._use_pallas or self.is_kv_layout_blocks_first ) + block_size: int + remote_block_size: dict[EngineId, int] + def tp_ratio( self, remote_tp_size: int, @@ -725,6 +729,19 @@ def tp_ratio( ) return self.tp_size // remote_tp_size + def block_size_ratio( + self, + remote_block_size: int, + ) -> float: + """ + Calculate the block size ratio between local and remote TP. + """ + assert self.block_size % remote_block_size == 0, ( + f"Local block size {self.block_size} is not divisible " + f"by remote block size {remote_block_size} or vice versa." + ) + return self.block_size // remote_block_size + def tp_ratio_from_engine_id( self, remote_engine_id: EngineId, @@ -732,6 +749,13 @@ def tp_ratio_from_engine_id( remote_tp_size = self.remote_tp_size[remote_engine_id] return self.tp_ratio(remote_tp_size) + def block_size_ratio_from_engine_id( + self, + remote_engine_id: EngineId, + ) -> float: + remote_block_size = self.remote_block_size[remote_engine_id] + return self.block_size_ratio(remote_block_size) + def is_kv_replicated(self, engine_id: EngineId) -> bool: """ Whether the KV cache is replicated across TP workers due to the @@ -866,6 +890,7 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str): # nixl_prepped_dlist_handle. self.src_xfer_side_handle: int = 0 + self.src_xfer_side_handles: dict[int, int] = {} # Map of engine_id -> nixl_prepped_dlist_handle (int)]. self.dst_xfer_side_handles: dict[EngineId, int] = {} @@ -925,6 +950,7 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str): logger.debug("Detected kv cache layout %s", self.kv_cache_layout) self._tp_size: dict[EngineId, int] = {self.engine_id: self.world_size} + self._block_size: dict[EngineId, int] = {self.engine_id: self.block_size} # With heterogeneous TP, P must wait for all assigned D TP workers to # finish reading before safely freeing the blocks. 
self.consumer_notification_counts_by_req = defaultdict[ReqId, int](int) @@ -936,6 +962,8 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str): remote_tp_size=self._tp_size, # shared state is_mla=self.use_mla, total_num_kv_heads=self.model_config.get_total_num_kv_heads(), + block_size=self.block_size, + remote_block_size=self._block_size, attn_backend=backend, ) self._use_pallas = self.kv_topo._use_pallas @@ -987,9 +1015,13 @@ def _nixl_handshake( ) # Register Remote agent. + assert metadata.block_size <= self.block_size, ( + "nP > nD is not supported yet." + ) remote_agent_name = self.add_remote_agent( metadata, p_remote_rank, remote_tp_size ) + setup_agent_time = time.perf_counter() logger.debug( "NIXL handshake: add agent took: %s", @@ -1217,43 +1249,10 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): self.num_regions *= 2 # Register local/src descr for NIXL xfer. - blocks_data = [] - for i, base_addr in enumerate(seen_base_addresses): - kv_block_len = self.get_backend_aware_kv_block_len(layer_idx=i) - # NOTE With heter-TP, more blocks are prepared than what are - # needed as self.num_blocks >= nixl_agent_meta.num_blocks. We - # could create fewer, but then _get_block_descs_ids needs to - # select agent_meta.num_blocks instead of self.num_blocks for - # local descr, and that makes handling regular flow less clean. - for block_id in range(self.num_blocks): - block_offset = block_id * self.block_len_per_layer[i] - addr = base_addr + block_offset - # (addr, len, device id) - blocks_data.append((addr, kv_block_len, self.device_id)) - - if self.kv_topo.is_kv_layout_blocks_first: - # Separate and interleave K/V regions to maintain the same - # descs ordering. This is needed for selecting contiguous heads - # when split across TP ranks. - for block_id in range(self.num_blocks): - block_offset = block_id * self.block_len_per_layer[i] - addr = base_addr + block_offset - # Register addresses for V cache (K registered first). - v_addr = addr + kv_block_len - blocks_data.append((v_addr, kv_block_len, self.device_id)) - logger.debug( - "Created %s blocks for src engine %s and rank %s on device id %s", - len(blocks_data), - self.engine_id, - self.tp_rank, - self.device_id, - ) + self.seen_base_addresses = seen_base_addresses + self.src_xfer_side_handle = self.register_local_xfer_handler(self.block_size) - descs = self.nixl_wrapper.get_xfer_descs(blocks_data, self.nixl_memory_type) - # NIXL_INIT_AGENT to be used for preparations of local descs. - self.src_xfer_side_handle = self.nixl_wrapper.prep_xfer_dlist( - "NIXL_INIT_AGENT", descs - ) + self.src_xfer_side_handles[self.block_size] = self.src_xfer_side_handle # TODO(mgoin): Hybrid memory allocator is currently disabled for # models with local attention (Llama 4). Can remove this once enabled. @@ -1289,8 +1288,62 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): kv_cache_layout=self.kv_cache_layout if not self.use_host_buffer else self.host_buffer_kv_cache_layout, + block_size=self.block_size, + ) + + def register_local_xfer_handler( + self, + block_size: int, + ) -> int: + """ + Function used for register local xfer handler with local block_size or + Remote block_size. + + When local block_size is same as remote block_size, we use local block_size + to register local_xfer_handler during init. + + When remote block size is less than local block size, we need to use + register another local_xfer_handler using remote block len to ensure + data copy correctness. 
+ """ + block_size_ratio = self.block_size // block_size + blocks_data = [] + for i, base_addr in enumerate(self.seen_base_addresses): + # The new block_len is using prefill block_len; + # and num_blocks is multiple with N + kv_block_len = ( + self.get_backend_aware_kv_block_len(layer_idx=i) // block_size_ratio + ) + block_len_per_layer = self.block_len_per_layer[i] // block_size_ratio + num_blocks = self.num_blocks * block_size_ratio + for block_id in range(num_blocks): + block_offset = block_id * block_len_per_layer + addr = base_addr + block_offset + # (addr, len, device id) + blocks_data.append((addr, kv_block_len, self.device_id)) + + if self.kv_topo.is_kv_layout_blocks_first: + # Separate and interleave K/V regions to maintain the same + # descs ordering. This is needed for selecting contiguous heads + # when split across TP ranks. + for block_id in range(num_blocks): + block_offset = block_id * block_len_per_layer + addr = base_addr + block_offset + # Register addresses for V cache (K registered first). + v_addr = addr + kv_block_len + blocks_data.append((v_addr, kv_block_len, self.device_id)) + logger.debug( + "Created %s blocks for src engine %s and rank %s on device id %s", + len(blocks_data), + self.engine_id, + self.tp_rank, + self.device_id, ) + descs = self.nixl_wrapper.get_xfer_descs(blocks_data, self.nixl_memory_type) + # NIXL_INIT_AGENT to be used for preparations of local descs. + return self.nixl_wrapper.prep_xfer_dlist("NIXL_INIT_AGENT", descs) + def add_remote_agent( self, nixl_agent_meta: NixlAgentMetadata, @@ -1349,6 +1402,8 @@ def add_remote_agent( ### Register remote agent metadata if engine_id not in self._tp_size: self._tp_size[engine_id] = remote_tp_size + if engine_id not in self._block_size: + self._block_size[engine_id] = nixl_agent_meta.block_size remote_agent_name = self.nixl_wrapper.add_remote_agent( nixl_agent_meta.agent_metadata @@ -1359,6 +1414,13 @@ def add_remote_agent( # Create dst descs and xfer side handles. TP workers have same #blocks # so we only register once per engine_id. + # Example: + # block_size_ratio > 1: + # remote: | 0| 1| 2| 3| 4| 5| 6| 7| 8| 9|10|11|12| + # local origin:| 0| 1| 8| 12| + # local mapped:| 0| 1| 2| 3| 4| 5| 6| 7| 8| 9|10|11|12|13|14|15| + block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id(engine_id) + if engine_id not in self.dst_num_blocks: self.dst_num_blocks[engine_id] = nixl_agent_meta.num_blocks @@ -1381,8 +1443,14 @@ def add_remote_agent( # Register all remote blocks, but only the corresponding kv heads. 
for i, base_addr in enumerate(nixl_agent_meta.kv_caches_base_addr): kv_block_len = self.get_backend_aware_kv_block_len(layer_idx=i) + remote_kv_block_len = kv_block_len // block_size_ratio + if block_size_ratio > 1: + # using remote kv_block_len as transfer unit + kv_block_len = remote_kv_block_len rank_offset = ( - self.tp_rank % tp_ratio * kv_block_len if not replicates_kv_cache else 0 + self.tp_rank % tp_ratio * remote_kv_block_len + if not replicates_kv_cache + else 0 ) for block_id in range(nixl_agent_meta.num_blocks): block_offset = block_id * nixl_agent_meta.block_lens[i] @@ -1417,6 +1485,13 @@ def add_remote_agent( remote_agent_name, descs ) + if block_size_ratio > 1: + # when prefill with smaller block_size, we need to init a + # new handler with same block_len to match + self.src_xfer_side_handles[nixl_agent_meta.block_size] = ( + self.register_local_xfer_handler(nixl_agent_meta.block_size) + ) + return remote_agent_name def _validate_remote_agent_handshake( @@ -1433,6 +1508,9 @@ def _validate_remote_agent_handshake( assert nixl_agent_meta.attn_backend_name == self.backend_name tp_ratio = self.kv_topo.tp_ratio_from_engine_id(remote_engine_id) + block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id( + remote_engine_id + ) assert tp_ratio > 0, "Decode TP cannot be smaller than prefill TP" assert not self._use_pallas or tp_ratio == 1, ( "TPU (pallas_v1) DOES NOT support heterogeneous TP yet." @@ -1463,33 +1541,26 @@ def _validate_remote_agent_handshake( remote_block_len = nixl_agent_meta.block_lens[0] if self.use_mla or self.kv_topo.is_kv_replicated(remote_engine_id): # With replicated KV cache, only the number of blocks can differ. - assert self.block_len_per_layer == nixl_agent_meta.block_lens, ( - "KV cache sizes must match between P and D when replicated" - ) - remote_block_size = remote_block_len // (self.slot_size_per_layer[0]) + for i in range(len(self.block_len_per_layer)): + assert ( + self.block_len_per_layer[i] // block_size_ratio + == nixl_agent_meta.block_lens[i] + ), "KV cache sizes must match between P and D when replicated" else: # When MLA is not used, this is a list of the same block length for block_len in nixl_agent_meta.block_lens: assert block_len == remote_block_len, ( "All remote layers must have the same block size" ) - remote_block_size = remote_block_len // ( - self.slot_size_per_layer[0] * tp_ratio - ) - if self.kv_topo.is_kv_layout_blocks_first: - # With flashinfer, KV are sent in the same message. - remote_block_size //= 2 - assert remote_block_len == self.block_len_per_layer[0] * tp_ratio, ( + assert ( + remote_block_len + == (self.block_len_per_layer[0] * tp_ratio) // block_size_ratio + ), ( "Remote P worker KV layer cache must be of shape [2, N, " "local_kv_heads*tp_ratio, block_size, head_dim] and same dtype." ) - assert self.block_size == remote_block_size, ( - "Remote P worker with different page/block size is not supported " - f"{self.block_size=}, {remote_block_size=}" - ) - # TP workers have same #blocks. 
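# A worked example of the remote descriptor addressing above, using assumed
# numbers (not taken from this change): base_addr=0x1000, remote
# block_lens[i]=4096 bytes, local kv_block_len=2048, block_size_ratio=2,
# tp_ratio=2, tp_rank=1, and no KV replication.
base_addr = 0x1000
remote_block_len = 4096
kv_block_len = 2048
block_size_ratio = 2
tp_ratio = 2
tp_rank = 1

remote_kv_block_len = kv_block_len // block_size_ratio  # transfer unit: 1024 bytes
rank_offset = tp_rank % tp_ratio * remote_kv_block_len  # this rank's head slice
addr_block_3 = base_addr + 3 * remote_block_len + rank_offset
assert remote_kv_block_len == 1024 and rank_offset == 1024
assert addr_block_3 == 17408  # 0x1000 + 3 * 4096 + 1024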
assert self.dst_num_blocks[remote_engine_id] == nixl_agent_meta.num_blocks @@ -1576,6 +1647,56 @@ def permute_device_kv(self, block_ids: list[int]): ) cache.index_copy_(0, indices, permuted_blocks) + def blocksize_post_process(self, block_ids_per_ratio: dict[float, list[list[int]]]): + def _process_local_gt_remote(blocks_to_update, block_size_ratio): + n_kv_heads, block_size, head_size = blocks_to_update.shape[1:] + remote_block_size = block_size // block_size_ratio + n_blocks = block_size_ratio + # actual permute is to convert + # for local blocksize > remote blocksize + # ex: local blocksize = 16 tokens, remote blocksize = 4 tokens + # local block[0] = remote block[0, 1, 2, 3] + # remote is |h0-b0|h1-b0|h2-b0|h3-b0|h0-b1|h1-b1|h2-b1|h3-b1|... + # local is |h0-b0..................|h1-b0..................|... + # permute is to: + # 1. view => view remote as n_blocks * remote_shape(H,remoteN,D) + # 2. permute => (H, nblocks, remoteN, D) + # 3. flatten => (H, localN, D) + permuted_blocks = ( + blocks_to_update.reshape( + -1, n_blocks, n_kv_heads, remote_block_size, head_size + ) + .permute(0, 2, 1, 3, 4) + .flatten(2, 3) + ) + return permuted_blocks + + if len(self.device_kv_caches) == 0: + return + split_k_and_v = not ( + self.use_mla or self._use_pallas or self.kv_topo.is_kv_layout_blocks_first + ) + sample_cache = list(self.device_kv_caches.values())[0][0] + for block_size_ratio, block_ids_list in block_ids_per_ratio.items(): + assert block_size_ratio > 1, "Only nP < nD supported currently." + block_ids_list = [[item for sublist in block_ids_list for item in sublist]] + + for block_ids in block_ids_list: + indices = torch.tensor(block_ids, device=sample_cache.device) + + for _, cache_or_caches in self.device_kv_caches.items(): + cache_list = cache_or_caches if split_k_and_v else [cache_or_caches] + for cache in cache_list: + blocks_to_update = cache.index_select(0, indices) + # because kv_cache is always using original layout NHD as + # virtual shape while stride can be either HND / NHD at + # initialization. + # we need to firstly get physical view of the tensor + permuted_blocks = _process_local_gt_remote( + blocks_to_update.permute(0, 2, 1, 3), block_size_ratio + ).permute(0, 2, 1, 3) + cache.index_copy_(0, indices, permuted_blocks) + def get_finished(self) -> tuple[set[str], set[str]]: """ Get requests that are done sending or recving on this specific worker. 
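# A tiny torch sketch of the view/permute/flatten fix documented in
# _process_local_gt_remote above, using assumed sizes: 2 kv heads, local
# block_size=4, remote block_size=2 (block_size_ratio=2), head_size=1.
import torch

ratio, n_kv_heads, remote_block_size, head_size = 2, 2, 2, 1
# Physical contents of one local block as written by the remote: two
# remote-sized blocks back to back, each laid out in (head, token) order.
# Head 0 holds token values 0..3, head 1 holds token values 10..13.
physical = torch.tensor([0, 1, 10, 11, 2, 3, 12, 13]).reshape(1, n_kv_heads, 4, head_size)
fixed = (
    physical.reshape(-1, ratio, n_kv_heads, remote_block_size, head_size)
    .permute(0, 2, 1, 3, 4)
    .flatten(2, 3)
)
# After the permute, the local HND layout holds all of h0's tokens, then h1's.
assert fixed.flatten().tolist() == [0, 1, 2, 3, 10, 11, 12, 13]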
@@ -1599,6 +1720,7 @@ def get_finished(self) -> tuple[set[str], set[str]]: ) block_ids_to_permute = [] + block_ids_for_blocksize_post_process = defaultdict(list) for req_id in done_recving: # clean up metadata for completed requests meta = self._recving_metadata.pop(req_id, None) @@ -1607,6 +1729,20 @@ def get_finished(self) -> tuple[set[str], set[str]]: self.sync_recved_kv_to_device(req_id, meta) if self.enable_permute_local_kv: block_ids_to_permute += meta.local_physical_block_ids + + # post processing for heteroblocksize + block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id( + meta.remote_engine_id + ) + if ( + not self.use_mla + and block_size_ratio > 1 + and self.kv_cache_layout == "HND" + ): + block_ids_for_blocksize_post_process[block_size_ratio].append( + meta.local_block_ids + ) + self.blocksize_post_process(block_ids_for_blocksize_post_process) if len(block_ids_to_permute) > 0: self.permute_device_kv(block_ids_to_permute) @@ -1781,6 +1917,24 @@ def _read_blocks( dst_engine_id: str, request_id: str, ): + block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id(dst_engine_id) + if block_size_ratio > 1: + local_block_ids = self.get_mapped_blocks( + np.asarray(local_block_ids), block_size_ratio + ) + if len(local_block_ids) > len(remote_block_ids): + # NOTE: + # get_mapped_blocks will always expand block_ids for n times. + # ex: + # prefill block_ids with block_size as 4: + # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + # Local decode block_ids with block_size as 16: [1, 2, 3] + # expland ecode block_ids with get_mapped_blocks from [1, 2, 3] to + # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] + # Then we clip local to align with prefill + # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] to + # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + local_block_ids = local_block_ids[: len(remote_block_ids)] # NOTE(rob): having the staging blocks be on the READER side is # not going to work well (since we will have to call rearrange tensors). # after we detect the txn is complete (which means we cannot make the @@ -1823,7 +1977,10 @@ def _read_blocks( remote_block_ids = remote_block_ids[-num_local_blocks:] # Get side handles. - local_xfer_side_handle = self.src_xfer_side_handle + remote_block_size = self.kv_topo.remote_block_size[dst_engine_id] + local_xfer_side_handle = self.src_xfer_side_handles.get( + remote_block_size, self.src_xfer_side_handle + ) remote_xfer_side_handle = self.dst_xfer_side_handles[dst_engine_id] # NOTE (nicolo) With homogeneous TP, each TP worker loads KV from @@ -1833,13 +1990,17 @@ def _read_blocks( # Get descs ids. local_block_descs_ids: np.ndarray remote_block_descs_ids: np.ndarray + if not self.block_window_per_layer: # Default case: assume global attention remote_block_descs_ids = self._get_block_descs_ids( - dst_engine_id, remote_block_ids + dst_engine_id, + remote_block_ids, ) local_block_descs_ids = self._get_block_descs_ids( - self.engine_id, local_block_ids + self.engine_id, + local_block_ids, + block_size_ratio=block_size_ratio, ) else: # TODO(mgoin): remove this once we have hybrid memory allocator @@ -1860,10 +2021,15 @@ def _read_blocks( # Get descs ids for the layer. 
layer_local_desc_ids = self._get_block_descs_ids( - self.engine_id, layer_local_block_ids, layer_idx + dst_engine_id, + layer_local_block_ids, + layer_idx, ) layer_remote_desc_ids = self._get_block_descs_ids( - dst_engine_id, layer_remote_block_ids, layer_idx + self.engine_id, + layer_remote_block_ids, + layer_idx, + block_size_ratio=block_size_ratio, ) local_descs_list.append(layer_local_desc_ids) @@ -1905,8 +2071,31 @@ def _read_blocks( self.nixl_wrapper.release_xfer_handle(handle) self._failed_recv_reqs.add(request_id) + def get_mapped_blocks(self, block_ids, block_size_ratio): + """ + Calculates the new set of block IDs by mapping every element + in the (potentially sparse) input array. + Example: block_ids=[0, 2], block_size_ratio=2 + get_mapped_blocks 0 1 [2 3] 4 5 + # remote is |h0-b0|h1-b0||h0-b1|h1-b1||h0-b1|h1-b1|| + # local is |h0-b0......||h1-b0......||h2-b0........ + local_block_ids 0 [1] 2 + """ + if block_ids.size == 0: + return np.array([], dtype=np.int64) + + start_ids = block_ids * block_size_ratio + offsets = np.arange(block_size_ratio) + mapped_2d = start_ids[:, None] + offsets[None, :] + + return mapped_2d.flatten().astype(np.int64) + def _get_block_descs_ids( - self, engine_id: str, block_ids: list[int], layer_idx: int | None = None + self, + engine_id: str, + block_ids: list[int], + layer_idx: int | None = None, + block_size_ratio: float | None = None, ) -> np.ndarray: """ Get the descs ids for a set of block ids. @@ -1929,6 +2118,8 @@ def _get_block_descs_ids( region_ids = np.arange(layer_idx, layer_idx + 1) num_blocks = self.dst_num_blocks[engine_id] + if block_size_ratio is not None: + num_blocks = int(num_blocks * block_size_ratio) # Compute the desc ids for each block. region_ids = region_ids[:, None] From 6965ef436fb398bfbbdce5b6f88dd842c5944771 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Sat, 15 Nov 2025 00:52:14 -0500 Subject: [PATCH 091/578] [Performance][DeepGEMM] Estimate expected_m (#28694) Signed-off-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath --- tests/kernels/moe/test_deepep_deepgemm_moe.py | 46 ++++++++++++++----- vllm/forward_context.py | 4 ++ .../layers/fused_moe/batched_deep_gemm_moe.py | 40 ++++++++++++++-- 3 files changed, 73 insertions(+), 17 deletions(-) diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py index 0faf8bc95d2e..455ecacef5ec 100644 --- a/tests/kernels/moe/test_deepep_deepgemm_moe.py +++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -7,6 +7,7 @@ """ import dataclasses +from contextlib import contextmanager import pytest import torch.distributed @@ -14,6 +15,7 @@ from typing_extensions import ParamSpec from vllm.config import VllmConfig, set_current_vllm_config +from vllm.forward_context import set_forward_context from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, fp8_w8a8_moe_quant_config, @@ -61,6 +63,23 @@ P = ParamSpec("P") +@contextmanager +def with_dp_metadata(M: int, world_size: int): + num_tokens_across_dp = torch.tensor([M] * world_size, device="cpu", dtype=torch.int) + + vllm_config = VllmConfig() + vllm_config.parallel_config.data_parallel_size = world_size + vllm_config.parallel_config.enable_expert_parallel = True + + with set_forward_context( + None, + vllm_config, + num_tokens=M, + num_tokens_across_dp=num_tokens_across_dp, + ): + yield + + def next_power_of_2(x): import math @@ -285,18 +304,21 @@ def build_expert_map(): quant_config=quant_config, ) - out = mk.forward( - 
hidden_states=test_tensors.rank_tokens, - w1=w1, - w2=w2, - topk_weights=test_tensors.topk_weights, - topk_ids=test_tensors.topk, - inplace=False, - activation="silu", - global_num_experts=num_experts, - expert_map=build_expert_map(), - apply_router_weight_on_input=False, - ) + with with_dp_metadata( + M=test_tensors.rank_tokens.size(0), world_size=pgi.world_size + ): + out = mk.forward( + hidden_states=test_tensors.rank_tokens, + w1=w1, + w2=w2, + topk_weights=test_tensors.topk_weights, + topk_ids=test_tensors.topk, + inplace=False, + activation="silu", + global_num_experts=num_experts, + expert_map=build_expert_map(), + apply_router_weight_on_input=False, + ) return out diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 44bc2a4cda31..25fb7181a8f2 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -221,6 +221,10 @@ def get_forward_context() -> ForwardContext: return _forward_context +def is_forward_context_available() -> bool: + return _forward_context is not None + + def create_forward_context( attn_metadata: Any, vllm_config: VllmConfig, diff --git a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py index 79c92eb48612..53362277dae8 100644 --- a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py @@ -5,6 +5,7 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.forward_context import get_forward_context, is_forward_context_available from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( @@ -19,7 +20,7 @@ get_mk_alignment_for_contiguous_layout, is_deep_gemm_e8m0_used, ) -from vllm.utils.math_utils import cdiv +from vllm.utils.math_utils import cdiv, round_up logger = init_logger(__name__) @@ -313,6 +314,33 @@ def workspace_shapes( output = (num_experts, max_num_tokens * num_dispatchers, K) return (workspace13, workspace2, output) + def estimate_expected_m( + self, global_num_experts: int, max_tokens_per_expert: int, topk: int + ) -> int: + dp_meta = ( + get_forward_context().dp_metadata + if is_forward_context_available() + else None + ) + if dp_meta is None: + logger.warning_once( + "DPMetadata unavailable. Defaulting expected_m to " + f"{max_tokens_per_expert}.", + scope="local", + ) + return max_tokens_per_expert + + total_num_tokens = dp_meta.num_tokens_across_dp_cpu.sum().item() + total_num_tokens_replicated = total_num_tokens * topk + + # Assume even load balancing + assert global_num_experts != 0 + estimate = round_up(int(total_num_tokens_replicated // global_num_experts), 16) + # clamp estimate + estimate = max(estimate, 16) + estimate = min(max_tokens_per_expert, estimate) + return estimate + def apply( self, output: torch.Tensor, @@ -348,10 +376,12 @@ def apply( workspace1 = _resize_cache(workspace13, (E, max_num_tokens, N)) - # (from deepgemm docs) : A value hint (which is a value on CPU) - # for the M expectation of each batch, correctly setting this value - # may lead to better performance. 
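# A worked example of the estimate_expected_m() heuristic above, with assumed
# numbers: 4 DP ranks of 128 tokens each, topk=8, 256 global experts, and
# max_tokens_per_expert=512.
def _round_up(x: int, multiple: int) -> int:
    return ((x + multiple - 1) // multiple) * multiple

total_num_tokens = 4 * 128                   # summed across DP ranks
replicated = total_num_tokens * 8            # each token is routed to topk experts
estimate = _round_up(replicated // 256, 16)  # assume an even expert load
estimate = min(max(estimate, 16), 512)       # clamp to [16, max_tokens_per_expert]
assert estimate == 16                        # 512 * 8 / 256 = 16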
- expected_m = max_num_tokens + expected_m = self.estimate_expected_m( + global_num_experts=global_num_experts, + max_tokens_per_expert=max_num_tokens, + topk=topk_ids.size(-1), + ) + fp8_m_grouped_gemm_nt_masked( (a1q, a1q_scale), (w1, self.w1_scale), From 98b4d389ed27f09fd185ade889a02f640a3ff0b4 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 15 Nov 2025 14:47:41 +0800 Subject: [PATCH 092/578] [Redo] #26368 (#28771) Signed-off-by: Jialin Ouyang Signed-off-by: DarkLight1337 Co-authored-by: Jialin Ouyang --- tests/v1/core/test_async_scheduler.py | 3 +- .../v1/core/test_priority_scheduler_random.py | 6 +- tests/v1/core/test_scheduler.py | 88 +++++++++++-------- .../kv_connector/unit/test_nixl_connector.py | 7 +- tests/v1/kv_connector/unit/utils.py | 3 +- tests/v1/spec_decode/test_eagle.py | 5 +- tests/v1/spec_decode/test_ngram.py | 18 ++-- vllm/v1/core/sched/scheduler.py | 4 +- vllm/v1/outputs.py | 4 +- vllm/v1/sample/rejection_sampler.py | 8 +- vllm/v1/spec_decode/eagle.py | 7 +- vllm/v1/spec_decode/ngram_proposer.py | 6 +- vllm/v1/spec_decode/suffix_decoding.py | 10 ++- vllm/v1/worker/gpu_model_runner.py | 36 +++++--- vllm/v1/worker/tpu_model_runner.py | 8 +- 15 files changed, 122 insertions(+), 91 deletions(-) diff --git a/tests/v1/core/test_async_scheduler.py b/tests/v1/core/test_async_scheduler.py index e0645ed43015..1d80ee987591 100644 --- a/tests/v1/core/test_async_scheduler.py +++ b/tests/v1/core/test_async_scheduler.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections import deque +import numpy as np import pytest from vllm.v1.core.sched.output import SchedulerOutput @@ -21,7 +22,7 @@ def _make_model_runner_output( return ModelRunnerOutput( req_ids=req_ids, req_id_to_index={req_id: i for i, req_id in enumerate(req_ids)}, - sampled_token_ids=[[i] for i in range(len(req_ids))], + sampled_token_ids=[np.array([i]) for i in range(len(req_ids))], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], diff --git a/tests/v1/core/test_priority_scheduler_random.py b/tests/v1/core/test_priority_scheduler_random.py index b4805be80272..ba0b703302e3 100644 --- a/tests/v1/core/test_priority_scheduler_random.py +++ b/tests/v1/core/test_priority_scheduler_random.py @@ -3,6 +3,7 @@ import random import uuid +import numpy as np import pytest from vllm.config import VllmConfig @@ -99,8 +100,7 @@ def _mock_execute_model( random.randint(*num_output_tokens_range) for _ in range(len(request_ids)) ] sampled_token_ids = [ - [random.randint(0, 100) for _ in range(num_tokens)] - for num_tokens in num_output_tokens + np.random.randint(0, 100, size=num_tokens) for num_tokens in num_output_tokens ] return ModelRunnerOutput( @@ -196,6 +196,8 @@ def test_priority_scheduling_blast( num_blocks: int, ): random.seed(42) + np.random.seed(42) + seen_request_prompt_length = dict[str, int]() seen_request_ids = set[str]() seen_mm_hashes = set[str]() diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 04e738293cd7..0570c0854c67 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -3,6 +3,7 @@ import dataclasses from unittest.mock import Mock +import numpy as np import pytest import torch @@ -169,7 +170,7 @@ def test_schedule_partial_requests(): req_id_to_index=req_to_index, # Only the first request has a sampled token id because # the rest requests are still being prefilled. 
- sampled_token_ids=[[0], [], []], + sampled_token_ids=[np.array([0]), np.array([]), np.array([])], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -216,7 +217,7 @@ def test_no_mm_input_chunking(): model_runner_output = ModelRunnerOutput( req_ids=[request.request_id for request in requests], req_id_to_index=req_to_index, - sampled_token_ids=[[] for _ in range(len(requests))], + sampled_token_ids=[np.array([]) for _ in range(len(requests))], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -276,7 +277,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool): model_runner_output = ModelRunnerOutput( req_ids=[request.request_id for request in requests], req_id_to_index=req_to_index, - sampled_token_ids=[[] for _ in range(len(requests))], + sampled_token_ids=[np.array([]) for _ in range(len(requests))], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -300,7 +301,8 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool): model_runner_output = ModelRunnerOutput( req_ids=[request.request_id for request in requests], req_id_to_index=req_to_index, - sampled_token_ids=[[0], [0]] + [[] for _ in range(len(requests) - 2)], + sampled_token_ids=[np.array([0]), np.array([0])] + + [np.array([]) for _ in range(len(requests) - 2)], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -347,8 +349,8 @@ def test_stop_via_update_from_output(): req_ids=[req.request_id for req in requests], req_id_to_index={req.request_id: i for i, req in enumerate(requests)}, sampled_token_ids=[ - [EOS_TOKEN_ID], - [10, 11], + np.array([EOS_TOKEN_ID]), + np.array([10, 11]), ], # First request hits EOS, second continues logprobs=None, prompt_logprobs_dict={}, @@ -392,7 +394,10 @@ def test_stop_via_update_from_output(): model_output = ModelRunnerOutput( req_ids=[req.request_id for req in requests], req_id_to_index={req.request_id: i for i, req in enumerate(requests)}, - sampled_token_ids=[[10, 42, 12], [13, 14]], # First request hits stop token + sampled_token_ids=[ + np.array([10, 42, 12]), + np.array([13, 14]), + ], # First request hits stop token logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -436,7 +441,10 @@ def test_stop_via_update_from_output(): model_output = ModelRunnerOutput( req_ids=[req.request_id for req in requests], req_id_to_index={req.request_id: i for i, req in enumerate(requests)}, - sampled_token_ids=[[10, 11, 12], [13]], # First request exceeds max_tokens + sampled_token_ids=[ + np.array([10, 11, 12]), + np.array([13]), + ], # First request exceeds max_tokens logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -475,7 +483,7 @@ def test_stop_via_update_from_output(): model_output = ModelRunnerOutput( req_ids=[requests[0].request_id], req_id_to_index={requests[0].request_id: 0}, - sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]], + sampled_token_ids=[np.array([EOS_TOKEN_ID, 10, 11])], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -616,7 +624,7 @@ def test_schedule_concurrent_batches( model_runner_output = ModelRunnerOutput( req_ids=[requests[0].request_id], req_id_to_index={requests[0].request_id: 0}, - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -633,7 +641,7 @@ def test_schedule_concurrent_batches( model_runner_output = ModelRunnerOutput( req_ids=[requests[1].request_id], req_id_to_index={requests[1].request_id: 0}, - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], logprobs=None, 
prompt_logprobs_dict={}, pooler_output=[], @@ -670,7 +678,7 @@ def test_preempt_during_execution(): model_runner_output0 = ModelRunnerOutput( req_ids=[requests[0].request_id], req_id_to_index={requests[0].request_id: 0}, - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -687,7 +695,7 @@ def test_preempt_during_execution(): model_runner_output1 = ModelRunnerOutput( req_ids=[requests[1].request_id], req_id_to_index={requests[1].request_id: 0}, - sampled_token_ids=[[42]], + sampled_token_ids=[np.array([42])], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -704,14 +712,18 @@ def test_preempt_during_execution(): @pytest.mark.parametrize( "spec_tokens,output_tokens,expected", [ - ([[1, 2, 3]], [[1, 2, 3, 4]], (1, 3, 3, [1, 1, 1])), # perfect match - ([[1, 2, 3]], [[1, 5]], (1, 3, 1, [1, 0, 0])), # early mismatch - ([[1, 2], [3]], [[1, 2, 5], [3, 4]], (2, 3, 3, [2, 1])), # multiple sequences - ([[1]], [[1, 2]], (1, 1, 1, [1])), # single token sequence - ([[]], [[5]], (0, 0, 0, [0])), # empty sequence + ([[1, 2, 3]], [np.array([1, 2, 3, 4])], (1, 3, 3, [1, 1, 1])), # perfect match + ([[1, 2, 3]], [np.array([1, 5])], (1, 3, 1, [1, 0, 0])), # early mismatch + ( + [[1, 2], [3]], + [np.array([1, 2, 5]), np.array([3, 4])], + (2, 3, 3, [2, 1]), + ), # multiple sequences + ([[1]], [np.array([1, 2])], (1, 1, 1, [1])), # single token sequence + ([[]], [np.array([5])], (0, 0, 0, [0])), # empty sequence ( [[1, 2, 3], [4, 5, 6]], - [[1, 2, 7], [4, 8]], + [np.array([1, 2, 7]), np.array([4, 8])], (2, 6, 3, [2, 1, 0]), ), # multiple mismatches ], @@ -745,7 +757,7 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected): model_runner_output = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, - sampled_token_ids=[[0] for _ in range(len(requests))], + sampled_token_ids=[np.array([0]) for _ in range(len(requests))], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -972,7 +984,7 @@ def test_kv_connector_basic(is_async: bool): MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, - sampled_token_ids=[[1000]] * len(req_ids), + sampled_token_ids=[np.array([1000])] * len(req_ids), logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1025,7 +1037,7 @@ def test_kv_connector_basic(is_async: bool): MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, - sampled_token_ids=[[1000]] * len(req_ids), + sampled_token_ids=[np.array([1000])] * len(req_ids), logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1088,7 +1100,7 @@ def test_external_prefix_cache_metrics(): MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=[r.request_id for r in requests], req_id_to_index={r.request_id: i for i, r in enumerate(requests)}, - sampled_token_ids=[[1000]] * NUM_REQUESTS, + sampled_token_ids=[np.array([1000])] * NUM_REQUESTS, logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1154,7 +1166,7 @@ def test_kv_connector_unable_to_allocate(use_ec_connector, ec_role): MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, - sampled_token_ids=[[1000]] * len(req_ids), + sampled_token_ids=[np.array([1000])] * len(req_ids), logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1239,7 +1251,7 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role): MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, - sampled_token_ids=[[1000]] * 
len(req_ids), + sampled_token_ids=[np.array([1000])] * len(req_ids), logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1332,7 +1344,7 @@ def make_output(scheduler: Scheduler): return ModelRunnerOutput( req_ids=[req.request_id for req in scheduler.running], req_id_to_index={req.request_id: i for i, req in enumerate(scheduler.running)}, - sampled_token_ids=[[1000]] * len(scheduler.running), + sampled_token_ids=[np.array([1000])] * len(scheduler.running), logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1749,7 +1761,7 @@ def test_priority_scheduling_preemption(): req_id_to_index={ req.request_id: i for i, req in enumerate(low_priority_requests) }, - sampled_token_ids=[[100] for _ in low_priority_requests], + sampled_token_ids=[np.array([100]) for _ in low_priority_requests], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -1818,7 +1830,7 @@ def test_priority_scheduling_no_preemption_when_space_available(): req_id_to_index={ req.request_id: i for i, req in enumerate(low_priority_requests) }, - sampled_token_ids=[[100] for _ in low_priority_requests], + sampled_token_ids=[np.array([100]) for _ in low_priority_requests], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -2064,7 +2076,7 @@ def test_priority_scheduling_heap_property(): model_output = ModelRunnerOutput( req_ids=[req.req_id], req_id_to_index={req.req_id: 0}, - sampled_token_ids=[[100]], + sampled_token_ids=[np.array([100])], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -2150,7 +2162,7 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv( model_output = ModelRunnerOutput( req_ids=[request_low.request_id], req_id_to_index={request_low.request_id: 0}, - sampled_token_ids=[[100]], + sampled_token_ids=[np.array([100])], # spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, @@ -2181,7 +2193,7 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv( model_output = ModelRunnerOutput( req_ids=[req.request_id for req in requests], req_id_to_index={req.request_id: i for i, req in enumerate(requests)}, - sampled_token_ids=[[100] for _ in requests], + sampled_token_ids=[np.array([100]) for _ in requests], # spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, @@ -2207,7 +2219,7 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv( model_output = ModelRunnerOutput( req_ids=[req.request_id for req in requests], req_id_to_index={req.request_id: i for i, req in enumerate(requests)}, - sampled_token_ids=[[], [100]], + sampled_token_ids=[np.array([]), np.array([100])], # spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, @@ -2624,7 +2636,7 @@ def test_ec_connector_with_partial_cache_hit_multi_round(use_kv_connector): model_output = ModelRunnerOutput( req_ids=[request1.request_id], req_id_to_index={request1.request_id: 0}, - sampled_token_ids=[[100]], + sampled_token_ids=[np.array([100])], # spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, @@ -2830,7 +2842,7 @@ def test_ec_connector_unable_to_allocate(use_kv_connector): MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=req_to_index, - sampled_token_ids=[[1000]] * len(req_ids), + sampled_token_ids=[np.array([1000])] * len(req_ids), logprobs=None, prompt_logprobs_dict={}, pooler_output=[], @@ -2943,7 +2955,7 @@ def test_priority_scheduling_ec_connector_preemption_and_resumption( model_output = ModelRunnerOutput( req_ids=[request_low.request_id], req_id_to_index={request_low.request_id: 0}, - sampled_token_ids=[[100]], + 
sampled_token_ids=[np.array([100])], # spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, @@ -2994,7 +3006,7 @@ def test_priority_scheduling_ec_connector_preemption_and_resumption( model_output = ModelRunnerOutput( req_ids=[req.request_id for req in requests], req_id_to_index={req.request_id: i for i, req in enumerate(requests)}, - sampled_token_ids=[[100] for _ in requests], + sampled_token_ids=[np.array([100]) for _ in requests], # spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, @@ -3029,7 +3041,7 @@ def test_priority_scheduling_ec_connector_preemption_and_resumption( model_output = ModelRunnerOutput( req_ids=[req.request_id for req in requests], req_id_to_index={req.request_id: i for i, req in enumerate(requests)}, - sampled_token_ids=[[100], [100, 200]], + sampled_token_ids=[np.array([100]), np.array([100, 200])], # spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, @@ -3215,7 +3227,7 @@ def test_ec_connector_allocate_encoder_tokens_with_external_load(use_kv_connecto model_output = ModelRunnerOutput( req_ids=[request1.request_id, request2.request_id], req_id_to_index={request1.request_id: 0, request2.request_id: 1}, - sampled_token_ids=[[100], [121]], + sampled_token_ids=[np.array([100]), np.array([121])], # spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index b7d7a10057b8..b264e5108c16 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -11,6 +11,7 @@ from collections import defaultdict from unittest.mock import patch +import numpy as np import pytest import ray import torch @@ -826,7 +827,7 @@ def test_kv_connector_stats_aggregation(): output = ModelRunnerOutput( req_ids=[f"req_{i}"], req_id_to_index={f"req_{i}": 0}, - sampled_token_ids=[[123]], # dummy token + sampled_token_ids=[np.array([123])], # dummy token logprobs=None, prompt_logprobs_dict={}, pooler_output=[None], @@ -907,7 +908,7 @@ def make_multi_stats(nixl_count: int, foo_count: int) -> MultiKVConnectorStats: output = ModelRunnerOutput( req_ids=[f"req_{i}"], req_id_to_index={f"req_{i}": 0}, - sampled_token_ids=[[123]], + sampled_token_ids=[np.array([123])], logprobs=None, prompt_logprobs_dict={}, pooler_output=[None], @@ -965,7 +966,7 @@ def test_scheduler_kv_connector_stats_aggregation(): model_output = ModelRunnerOutput( req_ids=["req_0"], req_id_to_index={"req_0": 0}, - sampled_token_ids=[[123]], + sampled_token_ids=[np.array([123])], logprobs=None, prompt_logprobs_dict={}, pooler_output=[None], diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index f35f91bb3adf..c248104d5b5e 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -7,6 +7,7 @@ from itertools import chain, count from typing import Any +import numpy as np import torch from vllm import SamplingParams @@ -228,7 +229,7 @@ def create_model_runner_output( # Make sampled tokens. 
sampled_token = EOS_TOKEN_ID if use_eos else token_id - sampled_token_ids = [[sampled_token] for _ in req_ids] + sampled_token_ids = [np.array([sampled_token]) for _ in req_ids] kv_connector_output = ( None diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index 89d0ec769ac0..421da5241555 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -3,6 +3,7 @@ from unittest import mock +import numpy as np import pytest import torch @@ -112,7 +113,9 @@ def test_prepare_next_token_ids(): sampled_token_ids_tensor = torch.tensor( sampled_token_ids, dtype=torch.int32, device=device ) - sampled_token_ids_cpu = [[i for i in seq if i != -1] for seq in sampled_token_ids] + sampled_token_ids_cpu = [ + np.array([i for i in seq if i != -1]) for seq in sampled_token_ids + ] expected_next_token_ids_cpu = [1, 4, 30, 40] expected_next_token_ids_tensor = torch.tensor( diff --git a/tests/v1/spec_decode/test_ngram.py b/tests/v1/spec_decode/test_ngram.py index 692c39282c37..563bc1d957f4 100644 --- a/tests/v1/spec_decode/test_ngram.py +++ b/tests/v1/spec_decode/test_ngram.py @@ -77,7 +77,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # No match. token_ids_cpu = np.array([[1, 2, 3, 4, 5]]) result = get_ngram_proposer(min_n=2, max_n=2, k=2).propose( - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -88,7 +88,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # No match for 4-gram. token_ids_cpu = np.array([[1, 2, 3, 4, 1, 2, 3]]) result = get_ngram_proposer(min_n=4, max_n=4, k=2).propose( - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -99,7 +99,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # No match for 4-gram but match for 3-gram. token_ids_cpu = np.array([[1, 2, 3, 4, 1, 2, 3]]) result = get_ngram_proposer(min_n=3, max_n=4, k=2).propose( - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -111,7 +111,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # In this case, the proposer should return the 4-gram match. token_ids_cpu = np.array([[2, 3, 4, 5, 1, 2, 3, 4, 1, 2, 3, 4]]) result = get_ngram_proposer(min_n=3, max_n=4, k=2).propose( - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -122,7 +122,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # Match for 2-gram and 3-gram, but not 4-gram. token_ids_cpu = np.array([[3, 4, 5, 2, 3, 4, 1, 2, 3, 4]]) result = get_ngram_proposer(min_n=2, max_n=4, k=2).propose( - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -133,7 +133,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # Multiple 3-gram matched, but always pick the first one. 
token_ids_cpu = np.array([[1, 2, 3, 100, 1, 2, 3, 200, 1, 2, 3, 300, 1, 2, 3]]) result = get_ngram_proposer(min_n=3, max_n=3, k=2).propose( - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -144,7 +144,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # check empty input token_ids_cpu = np.array([[]]) result = get_ngram_proposer(min_n=2, max_n=2, k=2).propose( - sampled_token_ids=[[0]], + sampled_token_ids=[np.array([0])], req_ids=["0"], num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]), token_ids_cpu=token_ids_cpu, @@ -157,7 +157,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: # second request has 3 tokens and no match. Padded with -1 for max len 5 token_ids_cpu = np.array([[1, 2, 3, 1, 2], [4, 5, 6, -1, -1]]) result = get_ngram_proposer(min_n=2, max_n=2, k=2).propose( - sampled_token_ids=[[0], [1]], + sampled_token_ids=[np.array([0]), np.array([1])], req_ids=["0", "1"], num_tokens_no_spec=np.array([5, 3]), token_ids_cpu=token_ids_cpu, @@ -181,7 +181,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: input_2[:3] = [4, 5, 6] token_ids_cpu = np.array([input_1, input_2]) result = ngram_proposer.propose( - sampled_token_ids=[[0], [1]], + sampled_token_ids=[np.array([0]), np.array([1])], req_ids=["0", "1"], num_tokens_no_spec=np.array([len(input_1), 3]), token_ids_cpu=token_ids_cpu, diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index ba7ad0c09173..c640c40a455d 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -1010,8 +1010,8 @@ def update_from_output( continue req_index = model_runner_output.req_id_to_index[req_id] - generated_token_ids = ( - sampled_token_ids[req_index] if sampled_token_ids else [] + generated_token_ids: list[int] = ( + sampled_token_ids[req_index].tolist() if sampled_token_ids else [] ) scheduled_spec_token_ids = ( diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index e32d5bb608b1..c0b2835c3124 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -158,7 +158,7 @@ class ModelRunnerOutput: # num_generated_tokens is the number of tokens # generated in the current step. It can be different for # each request due to speculative/jump decoding. - sampled_token_ids: list[list[int]] + sampled_token_ids: list[np.ndarray] # [num_reqs, max_num_logprobs + 1] # [num_reqs, max_num_logprobs + 1] @@ -220,7 +220,7 @@ def make_empty_encoder_model_runner_output( req_id_to_index: dict[str, int] = {rid: idx for idx, rid in enumerate(req_ids)} # No tokens generated yet ⇒ one empty list per request - sampled_token_ids: list[list[int]] = [[0] for _ in req_ids] + sampled_token_ids: list[list[int]] = [np.array([0]) for _ in req_ids] # Pooler outputs are not available yet ⇒ use None placeholders pooler_output: list[torch.Tensor | None] = [None for _ in req_ids] diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py index 926305d25f56..f31a0cddda9a 100644 --- a/vllm/v1/sample/rejection_sampler.py +++ b/vllm/v1/sample/rejection_sampler.py @@ -3,6 +3,7 @@ from dataclasses import replace +import numpy as np import torch import torch.nn as nn @@ -204,7 +205,7 @@ def _get_logprobs_tensors( def parse_output( output_token_ids: torch.Tensor, vocab_size: int, - ) -> list[list[int]]: + ) -> list[np.ndarray]: """Parse the output of the rejection sampler. 
Args: output_token_ids: The sampled token IDs in shape @@ -220,10 +221,7 @@ def parse_output( valid_mask = (output_token_ids_np != PLACEHOLDER_TOKEN_ID) & ( output_token_ids_np < vocab_size ) - outputs = [ - row[valid_mask[i]].tolist() for i, row in enumerate(output_token_ids_np) - ] - return outputs + return [row[valid_mask[i]] for i, row in enumerate(output_token_ids_np)] def apply_logits_processors( self, diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index beef5203e039..f3b34544f8d9 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -484,7 +484,7 @@ def propose( def prepare_next_token_ids_cpu( self, - sampled_token_ids: list[list[int]], + sampled_token_ids: list[np.ndarray], requests: dict[str, CachedRequestState], gpu_input_batch: InputBatch, num_scheduled_tokens: dict[str, int], @@ -499,7 +499,7 @@ def prepare_next_token_ids_cpu( req_ids = gpu_input_batch.req_ids next_token_ids: list[int] = [] for i, token_ids in enumerate(sampled_token_ids): - if token_ids: + if token_ids.shape[0] > 0: # Common case. next_token_id = token_ids[-1] else: @@ -510,10 +510,9 @@ def prepare_next_token_ids_cpu( seq_len = req_state.num_computed_tokens + num_scheduled_tokens[req_id] next_token_id = req_state.get_token_id(seq_len) next_token_ids.append(next_token_id) - next_token_ids = torch.tensor( + return torch.tensor( next_token_ids, dtype=torch.int32, device=self.input_ids.device ) - return next_token_ids def prepare_next_token_ids_padded( self, diff --git a/vllm/v1/spec_decode/ngram_proposer.py b/vllm/v1/spec_decode/ngram_proposer.py index e2f83cb24aa9..378937dba988 100644 --- a/vllm/v1/spec_decode/ngram_proposer.py +++ b/vllm/v1/spec_decode/ngram_proposer.py @@ -54,7 +54,7 @@ def __init__(self, vllm_config: VllmConfig): # Trigger Numba JIT compilation for N-gram proposer. # This usually takes less than 1 second. self.propose( - [[]] * 1024, + [np.array([])] * 1024, [""] * 1024, np.zeros(1024, dtype=np.int32), np.zeros((1024, self.max_model_len), dtype=np.int32), @@ -131,7 +131,7 @@ def batch_propose( def propose( self, - sampled_token_ids: list[list[int]], + sampled_token_ids: list[np.ndarray], req_ids: list[str], num_tokens_no_spec: np.ndarray, token_ids_cpu: np.ndarray, @@ -140,7 +140,7 @@ def propose( # find which requests need ngram proposals valid_ngram_requests = [] for i, sampled_ids in enumerate(sampled_token_ids): - num_sampled_ids = len(sampled_ids) + num_sampled_ids = sampled_ids.shape[0] if not num_sampled_ids: # Skip speculative decoding. continue diff --git a/vllm/v1/spec_decode/suffix_decoding.py b/vllm/v1/spec_decode/suffix_decoding.py index 049e335db325..d76e0ffe778d 100644 --- a/vllm/v1/spec_decode/suffix_decoding.py +++ b/vllm/v1/spec_decode/suffix_decoding.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import numpy as np + from vllm.config import VllmConfig from vllm.v1.worker.gpu_input_batch import InputBatch @@ -32,16 +34,16 @@ def __init__(self, vllm_config: VllmConfig): def propose( self, input_batch: InputBatch, - sampled_token_ids: list[list[int]], + sampled_token_ids: list[np.ndarray], ) -> list[list[int]]: """ Propose speculative tokens for each request in the input batch. Suffix Decoding will speculate a dynamic number of tokens for each request every decoding step, so each entry in the returned list may have different lengths. 
""" - draft_token_ids: list[list[int]] = [] + draft_token_ids: list[np.ndarray] = [] for i, sampled_ids in enumerate(sampled_token_ids): - if not sampled_ids: + if sampled_ids.shape[0] == 0: # Skip speculative decoding for partial prefills. draft_token_ids.append([]) continue @@ -70,7 +72,7 @@ def propose( self.suffix_cache.start_request(req_id, prompt_token_ids) # Append the newly sampled ids to the suffix cache for this request. - self.suffix_cache.add_active_response(req_id, sampled_ids) + self.suffix_cache.add_active_response(req_id, sampled_ids.tolist()) # Suffix decoding only uses the most recent tokens up to max_tree_depth, so # we extract the pattern from the end of the input. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 9b3e5b668aab..d0d6164180e6 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -216,9 +216,11 @@ def get_output(self) -> ModelRunnerOutput: del self._logprobs_tensors del self._sampled_token_ids - valid_sampled_token_ids = self.sampled_token_ids_cpu.tolist() + valid_sampled_token_ids: list[np.ndarray] = [ + row for row in self.sampled_token_ids_cpu.numpy() + ] for i in self._invalid_req_indices: - valid_sampled_token_ids[i].clear() + valid_sampled_token_ids[i] = np.array([]) output = self._model_runner_output output.sampled_token_ids = valid_sampled_token_ids @@ -2339,7 +2341,7 @@ def _bookkeeping_sync( ) -> tuple[ dict[str, int], LogprobsLists | None, - list[list[int]], + list[np.ndarray], dict[str, LogprobsTensors | None], list[str], dict[str, int], @@ -2365,6 +2367,7 @@ def _bookkeeping_sync( num_sampled_tokens = sampler_output.sampled_token_ids.shape[0] sampled_token_ids = sampler_output.sampled_token_ids invalid_req_indices = [] + valid_sampled_token_ids: list[np.ndarray] if not self.use_async_scheduling: # Get the valid generated tokens. max_gen_len = sampled_token_ids.shape[-1] @@ -2379,7 +2382,7 @@ def _bookkeeping_sync( ) # Mask out the sampled tokens that should not be sampled. 
for i in discard_sampled_tokens_req_indices: - valid_sampled_token_ids[int(i)].clear() + valid_sampled_token_ids[int(i)] = np.array([]) else: valid_sampled_token_ids = [] invalid_req_indices = discard_sampled_tokens_req_indices.tolist() @@ -2407,19 +2410,24 @@ def _bookkeeping_sync( [0] if spec_decode_metadata and logprobs_tensors else None ) for req_idx in range(num_sampled_tokens): + sampled_ids: np.ndarray | None if self.use_async_scheduling: - sampled_ids = [-1] if req_idx not in invalid_req_indices_set else None + sampled_ids = ( + np.array([-1]) if req_idx not in invalid_req_indices_set else None + ) else: sampled_ids = valid_sampled_token_ids[req_idx] - num_sampled_ids: int = len(sampled_ids) if sampled_ids else 0 + num_sampled_ids: int = ( + sampled_ids.shape[0] if sampled_ids is not None else 0 + ) if cu_num_accepted_tokens is not None: cu_num_accepted_tokens.append( cu_num_accepted_tokens[-1] + num_sampled_ids ) - if not sampled_ids: + if sampled_ids is None or num_sampled_ids == 0: continue start_idx = self.input_batch.num_tokens_no_spec[req_idx] @@ -2761,7 +2769,9 @@ def sample_tokens( with record_function_or_nullcontext("gpu_model_runner: sample"): sampler_output = self._sample(logits, spec_decode_metadata) - def propose_draft_token_ids(sampled_token_ids): + def propose_draft_token_ids( + sampled_token_ids: torch.Tensor | list[np.ndarray], + ) -> None: assert spec_decode_common_attn_metadata is not None with record_function_or_nullcontext("gpu_model_runner: draft"): self._draft_token_ids = self.propose_draft_token_ids( @@ -2883,14 +2893,14 @@ def take_draft_token_ids(self) -> DraftTokenIds | None: def propose_draft_token_ids( self, scheduler_output: "SchedulerOutput", - sampled_token_ids: torch.Tensor | list[list[int]], + sampled_token_ids: torch.Tensor | list[np.ndarray], sampling_metadata: SamplingMetadata, hidden_states: torch.Tensor, sample_hidden_states: torch.Tensor, aux_hidden_states: list[torch.Tensor] | None, spec_decode_metadata: SpecDecodeMetadata | None, common_attn_metadata: CommonAttentionMetadata, - ) -> list[list[int]] | torch.Tensor: + ) -> torch.Tensor | list[list[int]]: num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if self.speculative_config.method == "ngram": assert isinstance(sampled_token_ids, list) @@ -2922,7 +2932,7 @@ def propose_draft_token_ids( for num_draft, tokens in zip( spec_decode_metadata.num_draft_tokens, sampled_token_ids ): - indices.append(offset + len(tokens) - 1) + indices.append(offset + tokens.shape[0] - 1) offset += num_draft + 1 indices = torch.tensor(indices, device=self.device) hidden_states = sample_hidden_states[indices] @@ -4862,7 +4872,7 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: return kv_cache_spec - def _to_list(self, sampled_token_ids: torch.Tensor) -> list[list[int]]: + def _to_list(self, sampled_token_ids: torch.Tensor) -> list[np.ndarray]: # This is a short term mitigation for issue mentioned in # https://github.com/vllm-project/vllm/issues/22754. 
# `tolist` would trigger a cuda wise stream sync, which @@ -4875,4 +4885,4 @@ def _to_list(self, sampled_token_ids: torch.Tensor) -> list[list[int]]: pinned.copy_(sampled_token_ids, non_blocking=True) self.transfer_event.record() self.transfer_event.synchronize() - return pinned.tolist() + return [row for row in pinned.numpy()] diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 01490e0dfac9..e9eb7cad38f8 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -1254,13 +1254,15 @@ def concat_lists(input_lists): max_gen_len = selected_token_ids.shape[-1] if max_gen_len == 1: - valid_sampled_token_ids = selected_token_ids.tolist() + valid_sampled_token_ids: list[np.ndarray] = [ + row for row in selected_token_ids.numpy() + ] # Mask out the sampled tokens that should not be sampled. # TODO: Keep in sync with gpu_model_runner.py, in particular # the "else" case here for i in discard_sampled_tokens_req_indices: - valid_sampled_token_ids[i].clear() + valid_sampled_token_ids[i] = np.array([]) # Append sampled tokens for i, req_state, seq_len in request_seq_lens: @@ -1273,7 +1275,7 @@ def concat_lists(input_lists): valid_mask = selected_token_ids != INVALID_TOKEN_ID gen_lens = valid_mask.sum(dim=1).tolist() valid_sampled_token_ids = [ - seq.tolist() for seq in selected_token_ids[valid_mask].split(gen_lens) + seq.numpy() for seq in selected_token_ids[valid_mask].split(gen_lens) ] self.input_batch.num_tokens[:num_reqs] += gen_lens for i, req_state, seq_len in request_seq_lens: From dd6ac1c2bb3d29f8ba612a2f66f350a2c55c7e8b Mon Sep 17 00:00:00 2001 From: Zhuohan Li Date: Fri, 14 Nov 2025 23:59:42 -0800 Subject: [PATCH 093/578] [RL] [V1] Remove unused device argument from reset_kv_cache (#28766) Signed-off-by: Zhuohan Li --- vllm/engine/protocol.py | 2 +- vllm/entrypoints/llm.py | 5 ++--- vllm/entrypoints/openai/api_server.py | 10 +++------- vllm/v1/engine/async_llm.py | 6 ++---- vllm/v1/engine/llm_engine.py | 3 +-- 5 files changed, 9 insertions(+), 17 deletions(-) diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 24fcd9fe1cab..462d2c4e50e7 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -125,7 +125,7 @@ async def reset_mm_cache(self) -> None: ... @abstractmethod - async def reset_prefix_cache(self, device: Device | None = None) -> None: + async def reset_prefix_cache(self) -> None: """Reset the prefix cache""" ... 
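# A minimal sketch of the pinned-host copy pattern used by _to_list() in
# gpu_model_runner.py above: copy the sampled ids into a pinned CPU buffer
# asynchronously, wait for the transfer, and hand out numpy row views instead
# of calling tolist(). The shapes and the CPU stand-in tensor below are
# assumed for the example; the real code waits on a CUDA event.
import torch

sampled = torch.arange(6, dtype=torch.int64).reshape(3, 2)  # stand-in for sampled ids
pin = torch.cuda.is_available()
pinned = torch.empty(sampled.shape, dtype=sampled.dtype, pin_memory=pin)
pinned.copy_(sampled, non_blocking=pin)
if pin:
    torch.cuda.synchronize()  # ensure the async copy finished before reading
rows = [row for row in pinned.numpy()]  # one np.ndarray per request, no tolist()
assert rows[0].tolist() == [0, 1]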
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 62717a7eacdf..b0786bd355aa 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -32,7 +32,6 @@ TokenizerMode, ) from vllm.engine.arg_utils import EngineArgs -from vllm.engine.protocol import Device from vllm.entrypoints.chat_utils import ( ChatCompletionMessageParam, ChatTemplateContentFormatOption, @@ -1499,8 +1498,8 @@ def start_profile(self) -> None: def stop_profile(self) -> None: self.llm_engine.stop_profile() - def reset_prefix_cache(self, device: Device | None = None) -> None: - self.llm_engine.reset_prefix_cache(device) + def reset_prefix_cache(self) -> None: + self.llm_engine.reset_prefix_cache() def sleep(self, level: int = 1): """ diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 3e59af717d95..3cf66fcd27e2 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -39,7 +39,7 @@ import vllm.envs as envs from vllm.config import VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.protocol import Device, EngineClient +from vllm.engine.protocol import EngineClient from vllm.entrypoints.anthropic.protocol import ( AnthropicError, AnthropicErrorResponse, @@ -1069,12 +1069,8 @@ async def reset_prefix_cache(raw_request: Request): Reset the prefix cache. Note that we currently do not check if the prefix cache is successfully reset in the API server. """ - device = None - device_str = raw_request.query_params.get("device") - if device_str is not None: - device = Device[device_str.upper()] - logger.info("Resetting prefix cache with specific %s...", str(device)) - await engine_client(raw_request).reset_prefix_cache(device) + logger.info("Resetting prefix cache...") + await engine_client(raw_request).reset_prefix_cache() return Response(status_code=200) @router.post("/reset_mm_cache") diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 48ea6ef8515c..c160c7cbcab4 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -14,7 +14,7 @@ import vllm.envs as envs from vllm.config import VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.protocol import Device, EngineClient +from vllm.engine.protocol import EngineClient from vllm.entrypoints.utils import _validate_truncation_size from vllm.inputs import PromptType from vllm.logger import init_logger @@ -672,9 +672,7 @@ async def reset_mm_cache(self) -> None: self.processor.clear_mm_cache() await self.engine_core.reset_mm_cache_async() - async def reset_prefix_cache(self, device: Device | None = None) -> None: - if device == Device.CPU: - raise ValueError("Not supported on CPU.") + async def reset_prefix_cache(self) -> None: await self.engine_core.reset_prefix_cache_async() async def sleep(self, level: int = 1) -> None: diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 1db83446ba0b..e403cea87788 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -14,7 +14,6 @@ from vllm.distributed import stateless_destroy_torch_distributed_process_group from vllm.distributed.parallel_state import get_dp_group from vllm.engine.arg_utils import EngineArgs -from vllm.engine.protocol import Device from vllm.inputs import PromptType from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -321,7 +320,7 @@ def reset_mm_cache(self): self.processor.clear_mm_cache() self.engine_core.reset_mm_cache() - def 
reset_prefix_cache(self, device: Device | None = None): + def reset_prefix_cache(self): self.engine_core.reset_prefix_cache() def sleep(self, level: int = 1): From 74b5267d3a2d49be548e488650d1504be0b3e3fe Mon Sep 17 00:00:00 2001 From: "Jane (Yuan) Xu" <31798555+janeyx99@users.noreply.github.com> Date: Sat, 15 Nov 2025 04:10:15 -0500 Subject: [PATCH 094/578] Use narrow over indexing in `hadacore_transform` to prep for ABI stable (#28756) Signed-off-by: Jane Xu --- csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu b/csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu index 5369d409f9b2..aff11326d78e 100644 --- a/csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu +++ b/csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu @@ -802,7 +802,7 @@ torch::Tensor hadacore_transform(torch::Tensor& x, bool inplace) { }); if (numel % 256 != 0) { - out = out.index({torch::indexing::Slice(0, numel / had_size)}); + out = out.narrow(0, 0, numel / had_size); } if (inplace && out.data_ptr() != x.data_ptr()) { From 1ec978c209391286d4cee968426900e9a4d256a5 Mon Sep 17 00:00:00 2001 From: Zhewen Li Date: Sat, 15 Nov 2025 01:10:48 -0800 Subject: [PATCH 095/578] [Kernel][Moe Configs] llama4 maverick fp8 moe config tp8 on mi325 (#28709) Signed-off-by: Zhewen Li --- ...me=AMD_Instinct_MI325X,dtype=fp8_w8a8.json | 164 ++++++++++++++++++ 1 file changed, 164 insertions(+) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json new file mode 100644 index 000000000000..555d17364452 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { 
+ "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} From 638e4196d15f14a5fe68a64000801abda6c2ef8f Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 15 Nov 2025 17:59:31 +0800 Subject: [PATCH 096/578] [Misc] Make `SchedulerConfig.max_model_len` init-only (#28733) Signed-off-by: DarkLight1337 --- tests/kernels/moe/test_batched_moe.py | 2 -- tests/kernels/moe/test_block_fp8.py | 2 -- tests/kernels/moe/test_block_int8.py | 2 -- tests/kernels/moe/test_cutlass_moe.py | 2 -- tests/kernels/moe/test_flashinfer.py | 2 -- tests/kernels/moe/test_moe.py | 2 -- tests/kernels/moe/test_pplx_cutlass_moe.py | 2 -- tests/kernels/moe/test_pplx_moe.py | 2 -- tests/kernels/moe/test_triton_moe_ptpc_fp8.py | 2 -- tests/kernels/quantization/test_block_fp8.py | 2 -- tests/kernels/quantization/test_block_int8.py | 2 -- vllm/config/scheduler.py | 36 +++++++++---------- vllm/config/vllm.py | 1 - vllm/platforms/cpu.py | 2 +- vllm/platforms/tpu.py | 2 +- vllm/platforms/xpu.py | 2 +- vllm/v1/core/sched/scheduler.py | 2 +- 17 files changed, 22 insertions(+), 45 deletions(-) diff --git a/tests/kernels/moe/test_batched_moe.py b/tests/kernels/moe/test_batched_moe.py index 62704bbcbbc7..2285709fa7d6 100644 --- a/tests/kernels/moe/test_batched_moe.py +++ b/tests/kernels/moe/test_batched_moe.py @@ -40,8 +40,6 @@ TOP_KS = [1, 2, 6] vllm_config = VllmConfig() -vllm_config.scheduler_config.max_num_seqs = 128 -vllm_config.scheduler_config.max_model_len = 8192 @dataclass diff --git a/tests/kernels/moe/test_block_fp8.py b/tests/kernels/moe/test_block_fp8.py index cd34617ee0fc..88db4b3e537c 100644 --- a/tests/kernels/moe/test_block_fp8.py +++ b/tests/kernels/moe/test_block_fp8.py @@ -33,8 +33,6 @@ pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True) vllm_config = VllmConfig() -vllm_config.scheduler_config.max_num_seqs = 128 -vllm_config.scheduler_config.max_model_len = 8192 # Test configurations DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32] diff --git a/tests/kernels/moe/test_block_int8.py b/tests/kernels/moe/test_block_int8.py index 3799e60f1294..e35ca4caa9db 100644 --- 
a/tests/kernels/moe/test_block_int8.py +++ b/tests/kernels/moe/test_block_int8.py @@ -18,8 +18,6 @@ pytest.skip("INT8 Triton requires CUDA 7.0 or higher", allow_module_level=True) vllm_config = VllmConfig() -vllm_config.scheduler_config.max_num_seqs = 128 -vllm_config.scheduler_config.max_model_len = 8192 DTYPES = [torch.bfloat16] diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py index 5512ccce47b0..c15837f14570 100644 --- a/tests/kernels/moe/test_cutlass_moe.py +++ b/tests/kernels/moe/test_cutlass_moe.py @@ -42,8 +42,6 @@ ] vllm_config = VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) -vllm_config.scheduler_config.max_num_seqs = 128 -vllm_config.scheduler_config.max_model_len = 8192 @dataclasses.dataclass diff --git a/tests/kernels/moe/test_flashinfer.py b/tests/kernels/moe/test_flashinfer.py index 707068b2bbdc..3a681d4603f8 100644 --- a/tests/kernels/moe/test_flashinfer.py +++ b/tests/kernels/moe/test_flashinfer.py @@ -45,8 +45,6 @@ ] vllm_config = VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) -vllm_config.scheduler_config.max_num_seqs = 128 -vllm_config.scheduler_config.max_model_len = 8192 def quant_fp8_per_tensor_batches(a): diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index c27cf2468ede..0550c2d9e212 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -81,8 +81,6 @@ ] vllm_config = VllmConfig() -vllm_config.scheduler_config.max_num_seqs = 128 -vllm_config.scheduler_config.max_model_len = 8192 def run_moe_test( diff --git a/tests/kernels/moe/test_pplx_cutlass_moe.py b/tests/kernels/moe/test_pplx_cutlass_moe.py index a2de64974b35..dd4eb4da913b 100644 --- a/tests/kernels/moe/test_pplx_cutlass_moe.py +++ b/tests/kernels/moe/test_pplx_cutlass_moe.py @@ -192,8 +192,6 @@ def pplx_cutlass_moe( vllm_config = VllmConfig() -vllm_config.scheduler_config.max_num_seqs = 128 -vllm_config.scheduler_config.max_model_len = 8192 def _pplx_moe( diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py index 0f0ed3326d15..f671b23d300c 100644 --- a/tests/kernels/moe/test_pplx_moe.py +++ b/tests/kernels/moe/test_pplx_moe.py @@ -81,8 +81,6 @@ DTYPES = [torch.float8_e4m3fn, torch.bfloat16] vllm_config = VllmConfig() -vllm_config.scheduler_config.max_num_seqs = 128 -vllm_config.scheduler_config.max_model_len = 8192 def torch_prepare( diff --git a/tests/kernels/moe/test_triton_moe_ptpc_fp8.py b/tests/kernels/moe/test_triton_moe_ptpc_fp8.py index 933cd9dbdeaa..7a467e160b78 100644 --- a/tests/kernels/moe/test_triton_moe_ptpc_fp8.py +++ b/tests/kernels/moe/test_triton_moe_ptpc_fp8.py @@ -18,8 +18,6 @@ pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True) vllm_config = VllmConfig() -vllm_config.scheduler_config.max_num_seqs = 128 -vllm_config.scheduler_config.max_model_len = 8192 def native_w8a8_per_token_matmul(A, B, As, Bs, output_dtype=torch.float16): diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py index 55f092e7ea69..e9973c1fcc15 100644 --- a/tests/kernels/quantization/test_block_fp8.py +++ b/tests/kernels/quantization/test_block_fp8.py @@ -29,8 +29,6 @@ pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True) vllm_config = VllmConfig() -vllm_config.scheduler_config.max_num_seqs = 128 -vllm_config.scheduler_config.max_model_len = 8192 # Test configurations DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32] diff --git 
a/tests/kernels/quantization/test_block_int8.py b/tests/kernels/quantization/test_block_int8.py index dabc10a122f7..310091b6a554 100644 --- a/tests/kernels/quantization/test_block_int8.py +++ b/tests/kernels/quantization/test_block_int8.py @@ -18,8 +18,6 @@ pytest.skip("INT8 Triton requires CUDA 7.0 or higher", allow_module_level=True) vllm_config = VllmConfig() -vllm_config.scheduler_config.max_num_seqs = 128 -vllm_config.scheduler_config.max_model_len = 8192 DTYPES = [torch.half, torch.bfloat16] M = [1, 33, 64, 222] diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py index 444568994a95..8194295ffedb 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -6,7 +6,7 @@ from dataclasses import InitVar from typing import TYPE_CHECKING, Any, ClassVar, Literal, cast -from pydantic import Field, field_validator, model_validator +from pydantic import Field, field_validator from pydantic.dataclasses import dataclass from typing_extensions import Self, deprecated @@ -48,13 +48,6 @@ class SchedulerConfig: In real usage, this should be set in `EngineArgs.create_engine_config`. """ - max_model_len: int = Field(default=8192, ge=1) - """Maximum length of a sequence (including prompt and generated text). - - The default value here is mainly for convenience when testing. - In real usage, this should duplicate `ModelConfig.max_model_len` via - `EngineArgs`.""" - max_num_partial_prefills: int = Field(default=1, ge=1) """For chunked prefill, the maximum number of sequences that can be partially prefilled concurrently.""" @@ -89,6 +82,12 @@ class SchedulerConfig: is_multimodal_model: bool = False """True if the model is multimodal.""" + max_model_len: InitVar[int] = 8192 + """Maximum length of a sequence (including prompt and generated text). + + Note: This is stored in the ModelConfig, and is used only here to + provide fallbacks and validate other attributes.""" + is_encoder_decoder: InitVar[bool] = False """True if the model is an encoder-decoder model. @@ -199,7 +198,7 @@ def _skip_none_validation(cls, value: Any, handler: Callable) -> Any: return value return handler(value) - def __post_init__(self, is_encoder_decoder: bool) -> None: + def __post_init__(self, max_model_len: int, is_encoder_decoder: bool) -> None: if is_encoder_decoder: # Chunked prefill should be disabled for encoder-decoder models. self.disable_chunked_mm_input = True @@ -221,7 +220,7 @@ def __post_init__(self, is_encoder_decoder: bool) -> None: if self.max_num_partial_prefills > 1: if self.long_prefill_token_threshold == 0: - self.long_prefill_token_threshold = int(self.max_model_len * 0.04) + self.long_prefill_token_threshold = int(max_model_len * 0.04) logger.info( "Concurrent partial prefills enabled with " @@ -232,6 +231,8 @@ def __post_init__(self, is_encoder_decoder: bool) -> None: self.long_prefill_token_threshold, ) + self.verify_max_model_len(max_model_len) + @property @deprecated( "`SchedulerConfig.chunked_prefill_enabled` has been renamed to " @@ -245,15 +246,14 @@ def chunked_prefill_enabled(self) -> bool: def chunked_prefill_enabled(self, value: bool): self.enable_chunked_prefill = value - @model_validator(mode="after") - def _verify_args(self) -> Self: + def verify_max_model_len(self, max_model_len: int) -> Self: if ( - self.max_num_batched_tokens < self.max_model_len + self.max_num_batched_tokens < max_model_len and not self.enable_chunked_prefill ): raise ValueError( f"max_num_batched_tokens ({self.max_num_batched_tokens}) is " - f"smaller than max_model_len ({self.max_model_len}). 
" + f"smaller than max_model_len ({max_model_len}). " "This effectively limits the maximum sequence length to " "max_num_batched_tokens and makes vLLM reject longer " "sequences. Please increase max_num_batched_tokens or " @@ -267,12 +267,12 @@ def _verify_args(self) -> Self: f"({self.max_num_seqs})." ) - if self.max_num_batched_tokens > self.max_num_seqs * self.max_model_len: + if self.max_num_batched_tokens > self.max_num_seqs * max_model_len: logger.warning( "max_num_batched_tokens (%d) exceeds max_num_seqs " "* max_model_len (%d). This may lead to unexpected behavior.", self.max_num_batched_tokens, - self.max_num_seqs * self.max_model_len, + self.max_num_seqs * max_model_len, ) if self.max_num_partial_prefills > 1: @@ -282,11 +282,11 @@ def _verify_args(self) -> Self: "max_num_partial_prefills > 1." ) - if self.long_prefill_token_threshold > self.max_model_len: + if self.long_prefill_token_threshold > max_model_len: raise ValueError( "long_prefill_token_threshold " f"({self.long_prefill_token_threshold}) cannot be greater " - f"than the max_model_len ({self.max_model_len})." + f"than the max_model_len ({max_model_len})." ) if self.max_long_partial_prefills > self.max_num_partial_prefills: diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 1e6e455210c8..bf9bcd0e8a11 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -929,7 +929,6 @@ def recalculate_max_model_len(self, max_model_len: int): model_config = self.model_config max_model_len = model_config.get_and_verify_max_len(max_model_len) self.model_config.max_model_len = max_model_len - self.scheduler_config.max_model_len = max_model_len def try_verify_and_update_config(self): if self.model_config is None: diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 1da34629472c..ed655912d396 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -339,7 +339,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: ) vllm_config.scheduler_config.enable_chunked_prefill = False vllm_config.scheduler_config.max_num_batched_tokens = max( - vllm_config.scheduler_config.max_model_len, + vllm_config.model_config.max_model_len, vllm_config.scheduler_config.DEFAULT_MAX_NUM_BATCHED_TOKENS, ) diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index c1218801bc07..944344a22957 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -191,7 +191,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: ) vllm_config.scheduler_config.enable_chunked_prefill = False vllm_config.scheduler_config.max_num_batched_tokens = max( - vllm_config.scheduler_config.max_model_len, + vllm_config.model_config.max_model_len, vllm_config.scheduler_config.DEFAULT_MAX_NUM_BATCHED_TOKENS, ) diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index ad4beb28bdae..65516827a16d 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -185,7 +185,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: ) vllm_config.scheduler_config.enable_chunked_prefill = False vllm_config.scheduler_config.max_num_batched_tokens = max( - vllm_config.scheduler_config.max_model_len, + vllm_config.model_config.max_model_len, vllm_config.scheduler_config.DEFAULT_MAX_NUM_BATCHED_TOKENS, ) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index c640c40a455d..bc15979dea62 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -83,7 +83,7 @@ def __init__( # Scheduling constraints. 
self.max_num_running_reqs = self.scheduler_config.max_num_seqs self.max_num_scheduled_tokens = self.scheduler_config.max_num_batched_tokens - self.max_model_len = self.scheduler_config.max_model_len + self.max_model_len = vllm_config.model_config.max_model_len self.enable_kv_cache_events = ( self.kv_events_config is not None and self.kv_events_config.enable_kv_cache_events From 173b356abff3e2e547fc44c60361f3b0adc41aaf Mon Sep 17 00:00:00 2001 From: Vadim Gimpelson <156319763+vadiklyutiy@users.noreply.github.com> Date: Sat, 15 Nov 2025 14:13:41 +0400 Subject: [PATCH 097/578] [PERF] Remove TRTLLM Gen attn kernel limitation `max_seq_len <=131072` (#28755) Signed-off-by: Vadim Gimpelson --- vllm/config/vllm.py | 15 --------------- vllm/utils/flashinfer.py | 6 ++---- 2 files changed, 2 insertions(+), 19 deletions(-) diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index bf9bcd0e8a11..87f6b6eed851 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -483,21 +483,6 @@ def __post_init__(self): "Overriding cudagraph_mode to PIECEWISE." ) self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE - elif ( - current_platform.is_cuda() - and current_platform.is_device_capability(100) - and self.model_config.max_model_len > 131072 - and not self.model_config.use_mla - ): - # Refer to vllm/utils/flashinfer.py::use_trtllm_attention() - logger.warning_once( - "NVIDIA Blackwell TRTLLM attention cannot support " - "max_model_len >= 131072 (found " - f"{self.model_config.max_model_len}), causing dynamic " - "dispatching that breaks full cudagraphs. " - "Overriding cudagraph_mode to PIECEWISE." - ) - self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE # disable cudagraph when enforce eager execution if self.model_config is not None and self.model_config.enforce_eager: diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index 79e5a4c30259..1209d64901bf 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -319,14 +319,12 @@ def use_trtllm_attention( # Environment variable not set - use auto-detection if is_prefill: # Prefill auto-detection - use_trtllm = max_seq_len <= 131072 and kv_cache_dtype == "auto" + use_trtllm = kv_cache_dtype == "auto" if use_trtllm: logger.warning_once("Using TRTLLM prefill attention (auto-detected).") else: # Decode auto-detection - use_trtllm = ( - num_tokens <= 256 and max_seq_len <= 131072 and kv_cache_dtype == "auto" - ) + use_trtllm = num_tokens <= 256 and kv_cache_dtype == "auto" if use_trtllm: logger.warning_once("Using TRTLLM decode attention (auto-detected).") return use_trtllm From f36292dbee27a5ebe0e7115c061b82f6f5372dcf Mon Sep 17 00:00:00 2001 From: Angela Yi Date: Sat, 15 Nov 2025 03:46:12 -0800 Subject: [PATCH 098/578] [compile] Enable sequence parallelism matching w/o custom ops enabled (#27126) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: angelayi Signed-off-by: Luka Govedič Signed-off-by: ProExpertProg Co-authored-by: Luka Govedič Co-authored-by: Luka Govedič Co-authored-by: Luka Govedič --- .buildkite/test-pipeline.yaml | 14 +- tests/compile/test_fusions_e2e.py | 228 ++++++++++-- tests/compile/test_sequence_parallelism.py | 262 +++++++------- tests/distributed/test_sequence_parallel.py | 15 +- vllm/compilation/sequence_parallelism.py | 369 ++++++-------------- vllm/config/vllm.py | 28 +- 6 files changed, 472 insertions(+), 444 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 
52539728215b..723f311a2646 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -478,10 +478,11 @@ steps: - vllm/ - tests/compile commands: + # fp8 kv scales not supported on sm89, tested on Blackwell instead - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile' # Limit to no custom ops to reduce running time # Wrap with quotes to escape yaml and avoid starting -k string with a - - - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'" + - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" - label: Cudagraph test timeout_in_minutes: 20 @@ -925,7 +926,7 @@ steps: - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - pytest -v -s tests/kernels/moe/test_flashinfer.py -- label: Blackwell Fusion Tests # 30 min +- label: Blackwell Fusion & Compile Tests # 30 min timeout_in_minutes: 40 working_dir: "/vllm-workspace/" gpu: b200 @@ -946,7 +947,9 @@ steps: - pytest -v -s tests/compile/test_fusion_all_reduce.py # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time # Wrap with quotes to escape yaml - - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'" + - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" + # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) + - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile - label: Blackwell Fusion E2E Tests # 30 min timeout_in_minutes: 40 @@ -969,8 +972,6 @@ steps: - nvidia-smi # Run all e2e fusion tests - pytest -v -s tests/compile/test_fusions_e2e.py - # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) - - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile - label: Blackwell GPT-OSS Eval timeout_in_minutes: 60 @@ -1266,7 +1267,8 @@ steps: - pytest -v -s tests/compile/test_async_tp.py - pytest -v -s tests/compile/test_sequence_parallelism.py - pytest -v -s tests/compile/test_fusion_all_reduce.py - - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm + - "pytest -v -s tests/compile/test_fusions_e2e.py -k 'not Llama-4'" + - pytest -v -s tests/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 - pytest -v -s tests/v1/distributed/test_dbo.py diff --git a/tests/compile/test_fusions_e2e.py b/tests/compile/test_fusions_e2e.py index e1560efb3f24..f22d60ef000b 100644 --- a/tests/compile/test_fusions_e2e.py +++ b/tests/compile/test_fusions_e2e.py @@ -20,13 +20,22 @@ from ..utils import flat_product, multi_gpu_test +is_blackwell = lambda: current_platform.is_device_capability(100) +"""Are we running on Blackwell, a lot of tests depend on it""" + + +class Matches(NamedTuple): + attention_fusion: int = 0 + allreduce_fusion: int = 0 + sequence_parallel: int = 0 + async_tp: int = 0 + class ModelBackendTestCase(NamedTuple): model_name: str model_kwargs: dict[str, Any] backend: AttentionBackendEnum - attention_fusions: int - allreduce_fusions: int | None = None + matches: Matches MODELS_FP8: list[ModelBackendTestCase] = [] @@ 
-38,17 +47,33 @@ class ModelBackendTestCase(NamedTuple): ModelBackendTestCase( # Use smaller model for L40s in CI model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", - model_kwargs=dict(max_model_len=1024), - backend=AttentionBackendEnum.TRITON_ATTN, - attention_fusions=32, - allreduce_fusions=65, + # TODO while llama4 is broken, use FLASHINFER for llama3 on Blackwell + # so FI attention+fp8_quant is at least tested once + model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"), + backend=AttentionBackendEnum.FLASHINFER + if is_blackwell() + else AttentionBackendEnum.TRITON_ATTN, + matches=Matches( + attention_fusion=32, + allreduce_fusion=65, + sequence_parallel=65, + async_tp=128, + ), ), ModelBackendTestCase( model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"), - backend=AttentionBackendEnum.FLASHINFER, - attention_fusions=48, - allreduce_fusions=96, + # TODO FlashInfer attn broken on Hopper with kvcache=fp8: + # https://github.com/vllm-project/vllm/issues/28568 + # TODO FlashInfer attn broken on Blackwell for llama4: + # https://github.com/vllm-project/vllm/issues/28604 + backend=AttentionBackendEnum.TRITON_ATTN, + matches=Matches( + attention_fusion=48, + allreduce_fusion=96, + sequence_parallel=96, + async_tp=95, # mlp is moe, no fusion there + ), ), ] @@ -57,8 +82,12 @@ class ModelBackendTestCase(NamedTuple): model_name="nvidia/Llama-3.1-8B-Instruct-FP4", model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"), backend=AttentionBackendEnum.FLASHINFER, - attention_fusions=32, - allreduce_fusions=65, + matches=Matches( + attention_fusion=32, + allreduce_fusion=65, + sequence_parallel=65, + async_tp=128, + ), ), ] @@ -68,15 +97,23 @@ class ModelBackendTestCase(NamedTuple): model_name="meta-llama/Llama-3.1-8B-Instruct", model_kwargs=dict(max_model_len=1024), backend=AttentionBackendEnum.TRITON_ATTN, - attention_fusions=0, - allreduce_fusions=65, + matches=Matches( + attention_fusion=0, + allreduce_fusion=65, + sequence_parallel=65, + async_tp=128, + ), ), ModelBackendTestCase( model_name="Qwen/Qwen3-30B-A3B", model_kwargs=dict(max_model_len=1024), backend=AttentionBackendEnum.TRITON_ATTN, - attention_fusions=0, - allreduce_fusions=97, + matches=Matches( + attention_fusion=0, + allreduce_fusion=97, + sequence_parallel=97, + async_tp=96, # MLP is MoE, half the fusions of dense + ), ), ] @@ -86,19 +123,19 @@ class ModelBackendTestCase(NamedTuple): model_name="amd/Llama-3.1-8B-Instruct-FP8-KV", model_kwargs=dict(max_model_len=1024), backend=AttentionBackendEnum.TRITON_ATTN, - attention_fusions=32, + matches=Matches(attention_fusion=32), ), ModelBackendTestCase( model_name="amd/Llama-3.1-8B-Instruct-FP8-KV", model_kwargs=dict(max_model_len=1024), backend=AttentionBackendEnum.ROCM_ATTN, - attention_fusions=32, + matches=Matches(attention_fusion=32), ), ModelBackendTestCase( model_name="amd/Llama-3.1-8B-Instruct-FP8-KV", model_kwargs=dict(max_model_len=1024), backend=AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN, - attention_fusions=32, + matches=Matches(attention_fusion=32), ), ] @@ -106,8 +143,7 @@ class ModelBackendTestCase(NamedTuple): @pytest.mark.parametrize( - "model_name, model_kwargs, backend, " - "attention_fusions, allreduce_fusions, custom_ops", + "model_name, model_kwargs, backend, matches, custom_ops", # Test attention+quant_fp8 fusion with custom and torch impls of QuantFP8 list(flat_product(MODELS_FP8, CUSTOM_OPS_FP8)) # quant_fp4 only has the custom impl @@ -118,15 +154,14 @@ def test_attn_quant( 
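For readers skimming the new test parametrization: `Matches` is a `NamedTuple` with per-pass defaults, and the tests rely on `_replace` to adjust a single expectation (for example, zeroing `attention_fusion` when attention+quant fusion cannot apply). A small self-contained sketch of that idiom, mirroring the class added above:

    from typing import NamedTuple

    class Matches(NamedTuple):
        attention_fusion: int = 0
        allreduce_fusion: int = 0
        sequence_parallel: int = 0
        async_tp: int = 0

    expected = Matches(attention_fusion=32, allreduce_fusion=65)
    # NamedTuples are immutable; _replace returns an updated copy rather than
    # mutating in place, so the original parametrized value stays untouched.
    expected = expected._replace(attention_fusion=0)
    print(expected)
    # Matches(attention_fusion=0, allreduce_fusion=65, sequence_parallel=0, async_tp=0)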
model_name: str, model_kwargs: dict[str, Any], backend: AttentionBackendEnum, - attention_fusions: int, - allreduce_fusions: int, + matches: Matches, custom_ops: str, inductor_graph_partition: bool, caplog_mp_spawn, monkeypatch, ): if backend == AttentionBackendEnum.FLASHINFER and ( - not current_platform.is_device_capability((10, 0)) or not has_flashinfer() + not is_blackwell() or not has_flashinfer() ): pytest.skip("FlashInfer attn fusion requires Blackwell and flashinfer") if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"): @@ -169,12 +204,12 @@ def test_attn_quant( with caplog_mp_spawn(logging.DEBUG) as log_holder: run_model(compilation_config, model_name, **model_kwargs) - matches = re.findall( + log_matches = re.findall( r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes", log_holder.text, ) - assert len(matches) == 1, log_holder.text - assert int(matches[0]) == attention_fusions + assert len(log_matches) == 1, log_holder.text + assert int(log_matches[0]) == matches.attention_fusion CUSTOM_OPS_RMS_NORM = ["-rms_norm", "+rms_norm"] @@ -187,8 +222,7 @@ def custom_ops_product(*custom_ops_lists: list[str]) -> Iterable[str]: @multi_gpu_test(num_gpus=2) @pytest.mark.parametrize( - "model_name, model_kwargs, backend, " - "attention_fusions, allreduce_fusions, custom_ops", + "model_name, model_kwargs, backend, matches, custom_ops", # Toggle RMSNorm and QuantFP8 for FP8 models list( flat_product( @@ -209,8 +243,7 @@ def test_tp2_attn_quant_allreduce_rmsnorm( model_name: str, model_kwargs: dict, backend: AttentionBackendEnum, - attention_fusions: int, - allreduce_fusions: int, + matches: Matches, custom_ops: str, inductor_graph_partition: bool, caplog_mp_spawn, @@ -219,6 +252,13 @@ def test_tp2_attn_quant_allreduce_rmsnorm( if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"): pytest.skip("Inductor graph partition requires torch>=2.9") + if "fp4" in model_name.lower() and not is_blackwell(): + pytest.skip("NVFP4 quant requires Blackwell") + + if backend == AttentionBackendEnum.FLASHINFER and not is_blackwell(): + # FlashInfer attn fusion requires Blackwell + matches = matches._replace(attention_fusion=0) + custom_ops_list = custom_ops.split(",") if custom_ops else [] if inductor_graph_partition: @@ -258,23 +298,135 @@ def test_tp2_attn_quant_allreduce_rmsnorm( run_model( compilation_config, model_name, tensor_parallel_size=2, **model_kwargs ) - matches = re.findall( + log_matches = re.findall( + r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes", + log_holder.text, + ) + assert len(log_matches) == 2, log_holder.text + + assert int(log_matches[0]) == matches.attention_fusion + assert int(log_matches[1]) == matches.attention_fusion + + log_matches = re.findall( + r"collective_fusion.py:\d+] Replaced (\d+) patterns", + log_holder.text, + ) + assert len(log_matches) == 2, log_holder.text + + assert int(log_matches[0]) == matches.allreduce_fusion + assert int(log_matches[1]) == matches.allreduce_fusion + + +@multi_gpu_test(num_gpus=2) +@pytest.mark.parametrize( + "model_name, model_kwargs, backend, matches, custom_ops", + # Toggle RMSNorm and QuantFP8 for FP8 models + list( + flat_product( + MODELS_FP8, custom_ops_product(CUSTOM_OPS_FP8, CUSTOM_OPS_RMS_NORM) + ) + ) + # Toggle RMSNorm for FP4 models and unquant models + + list(flat_product(MODELS_FP4 + MODELS, CUSTOM_OPS_RMS_NORM)), +) +@pytest.mark.parametrize("inductor_graph_partition", [True, False]) +@pytest.mark.skipif( + not current_platform.is_cuda(), + reason="sequence parallel 
only tested on CUDA", +) +def test_tp2_attn_quant_async_tp( + model_name: str, + model_kwargs: dict, + backend: AttentionBackendEnum, + matches: Matches, + custom_ops: str, + inductor_graph_partition: bool, + caplog_mp_spawn, + monkeypatch, +): + if is_blackwell(): + # TODO: https://github.com/vllm-project/vllm/issues/27893 + pytest.skip("Blackwell is not supported for AsyncTP pass") + + if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"): + pytest.skip("Inductor graph partition requires torch>=2.9") + + if "fp4" in model_name.lower() and not is_blackwell(): + pytest.skip("NVFP4 quant requires Blackwell") + + if backend == AttentionBackendEnum.FLASHINFER: + if not has_flashinfer(): + pytest.skip("FlashInfer backend requires flashinfer installed") + if not is_blackwell(): + # FlashInfer attn fusion requires Blackwell + matches = matches._replace(attention_fusion=0) + + custom_ops_list = custom_ops.split(",") if custom_ops else [] + + if inductor_graph_partition: + mode = CUDAGraphMode.FULL_AND_PIECEWISE + splitting_ops: list[str] | None = None + else: + mode = CUDAGraphMode.FULL_DECODE_ONLY + splitting_ops = [] + + # Disable, compile cache to make sure custom passes run. + # Otherwise, we can't verify fusion happened through the logs. + monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1") + + # To capture subprocess logs, we need to know whether spawn or fork is used. + # Force spawn as it is more general. + monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") + monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name) + + compilation_config = CompilationConfig( + # Testing properties + use_inductor_graph_partition=inductor_graph_partition, + cudagraph_mode=mode, + custom_ops=custom_ops_list, + splitting_ops=splitting_ops, + # Common + level=CompilationMode.VLLM_COMPILE, + pass_config=PassConfig( + enable_attn_fusion=True, + enable_noop=True, + enable_sequence_parallelism=True, + enable_async_tp=True, + ), + # Inductor caches custom passes by default as well via uuid + inductor_compile_config={"force_disable_caches": True}, + ) + + with caplog_mp_spawn(logging.DEBUG) as log_holder: + run_model( + compilation_config, model_name, tensor_parallel_size=2, **model_kwargs + ) + log_matches = re.findall( r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes", log_holder.text, ) - assert len(matches) == 2, log_holder.text + assert len(log_matches) == 2, log_holder.text + + assert int(log_matches[0]) == matches.attention_fusion + assert int(log_matches[1]) == matches.attention_fusion + + log_matches = re.findall( + r"sequence_parallelism.py:\d+] Replaced (\d+) patterns", + log_holder.text, + ) + assert len(log_matches) == 2, log_holder.text - assert int(matches[0]) == attention_fusions - assert int(matches[1]) == attention_fusions + assert int(log_matches[0]) == matches.sequence_parallel + assert int(log_matches[1]) == matches.sequence_parallel - matches = re.findall( + log_matches = re.findall( r"collective_fusion.py:\d+] Replaced (\d+) patterns", log_holder.text, ) - assert len(matches) == 2, log_holder.text + assert len(log_matches) == 2, log_holder.text - assert int(matches[0]) == allreduce_fusions - assert int(matches[1]) == allreduce_fusions + assert int(log_matches[0]) == matches.async_tp + assert int(log_matches[1]) == matches.async_tp def run_model(compile_config: int | CompilationConfig, model: str, **model_kwargs): diff --git a/tests/compile/test_sequence_parallelism.py b/tests/compile/test_sequence_parallelism.py index e909cf7393ad..9cd7f64b04af 
100644 --- a/tests/compile/test_sequence_parallelism.py +++ b/tests/compile/test_sequence_parallelism.py @@ -5,15 +5,15 @@ import torch import vllm.envs as envs -from vllm.compilation.fix_functionalization import FixFunctionalizationPass from vllm.compilation.fusion import RMSNormQuantFusionPass -from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func +from vllm.compilation.fx_utils import find_auto_fn from vllm.compilation.noop_elimination import NoOpEliminationPass from vllm.compilation.post_cleanup import PostCleanupPass from vllm.compilation.sequence_parallelism import SequenceParallelismPass from vllm.compilation.vllm_inductor_pass import VllmInductorPass from vllm.config import ( CompilationConfig, + CUDAGraphMode, DeviceConfig, ModelConfig, PassConfig, @@ -27,6 +27,7 @@ initialize_model_parallel, ) from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp from vllm.platforms import current_platform from vllm.utils.system_utils import update_environment_variables @@ -43,172 +44,157 @@ ] -class TestModel(torch.nn.Module): - def __init__(self, hidden_size=16, intermediate_size=32): +class TestAllReduceRMSNormModel(torch.nn.Module): + def __init__(self, hidden_size=16, eps=1e-6): super().__init__() self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.gate_proj = torch.nn.Parameter( - torch.empty((intermediate_size, hidden_size)) - ) - self.norm = RMSNorm(intermediate_size, 1e-05) - # Initialize weights - torch.nn.init.normal_(self.gate_proj, std=0.02) + self.eps = eps + self.norm = [RMSNorm(hidden_size, eps) for i in range(4)] + self.w = [torch.rand(hidden_size, hidden_size) for _ in range(3)] - def forward(self, hidden_states, residual): - """ - Forward pass implementing the operations in the FX graph + def forward(self, x): + z = torch.relu(x) + x = resid = tensor_model_parallel_all_reduce(z) + y = self.norm[0](x) - Args: - hidden_states: Input tensor - residual: Residual tensor from previous layer + z2 = torch.mm(y, self.w[0]) + x2 = tensor_model_parallel_all_reduce(z2) - Returns: - Tuple containing the output tensor - """ - # Reshape input - view = hidden_states.reshape(-1, self.hidden_size) + y2, resid = self.norm[1](x2, resid) - # matrix multiplication - permute = self.gate_proj.permute(1, 0) - mm = torch.mm(view, permute) + z3 = torch.mm(y2, self.w[1]) + x3 = tensor_model_parallel_all_reduce(z3) - # Tensor parallel all-reduce - all_reduce = tensor_model_parallel_all_reduce(mm) + y3, resid = self.norm[2](x3, resid) - # layer normalization - norm_output, residual_output = self.norm(all_reduce, residual) + z4 = torch.mm(y3, self.w[2]) + x4 = tensor_model_parallel_all_reduce(z4) - return norm_output, residual_output + y4, resid = self.norm[3](x4, resid) + return y4 def ops_in_model_before(self): return [torch.ops.vllm.all_reduce.default] def ops_in_model_after(self): return [ - torch.ops.vllm.reduce_scatter.default, torch.ops.vllm.all_gather.default, + torch.ops.vllm.reduce_scatter.default, ] def ops_in_model(self): - return [torch.ops._C.fused_add_rms_norm.default] + if RMSNorm.enabled(): + return [ + torch.ops._C.rms_norm.default, + torch.ops._C.fused_add_rms_norm.default, + ] + else: + return [] -class TestQuantModel(torch.nn.Module): - def __init__(self, hidden_size=16, intermediate_size=32): +class TestAllReduceRMSNormStaticQuantFP8Model(torch.nn.Module): + def 
__init__(self, hidden_size=16, eps=1e-6): super().__init__() - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size self.vllm_config = get_current_vllm_config() - self.gate_proj = torch.nn.Parameter( - torch.empty((intermediate_size, hidden_size)), requires_grad=False + self.hidden_size = hidden_size + self.eps = eps + self.norm = [RMSNorm(hidden_size, eps) for i in range(4)] + self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(3)] + self.w = [ + torch.rand(hidden_size, hidden_size) + .to(dtype=current_platform.fp8_dtype()) + .t() + for _ in range(3) + ] + + self.fp8_linear = Fp8LinearOp( + act_quant_static=True, + act_quant_group_shape=GroupShape.PER_TENSOR, ) - self.norm = RMSNorm(intermediate_size, 1e-05) - # Initialize weights - torch.nn.init.normal_(self.gate_proj, std=0.02) - - self.fp8_linear = Fp8LinearOp(act_quant_static=True) - - self.scale = torch.rand(1, dtype=torch.float32) - # Create a weight that is compatible with torch._scaled_mm, - # which expects a column-major layout. - self.w = torch.rand(hidden_size, intermediate_size).to(dtype=FP8_DTYPE).t() - self.wscale = torch.rand(1, dtype=torch.float32) - - def forward(self, hidden_states, residual): - """ - Forward pass implementing the operations in the FX graph - - Args: - hidden_states: Input tensor - residual: Residual tensor from previous layer - - Returns: - Tuple containing the output tensor - """ - # Reshape input - view = hidden_states.reshape(-1, self.hidden_size) - - # matrix multiplication - permute = self.gate_proj.permute(1, 0) - mm = torch.mm(view, permute) - - # Tensor parallel all-reduce - all_reduce = tensor_model_parallel_all_reduce(mm) - - # layer normalization - norm_output, residual_output = self.norm(all_reduce, residual) - - # scaled_mm with static input quantization - fp8_linear_result = self.fp8_linear.apply( - norm_output, - self.w, - self.wscale, - input_scale=self.scale.to(norm_output.device), + + self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(3)] + + def forward(self, hidden_states): + # avoid having graph input be an arg to a pattern directly + z = torch.relu(hidden_states) + x = resid = tensor_model_parallel_all_reduce(z) + y = self.norm[0](x) + + z2 = self.fp8_linear.apply( + y, self.w[0], self.wscale[0], input_scale=self.scale[0] ) - return fp8_linear_result, residual_output + x2 = tensor_model_parallel_all_reduce(z2) + y2, resid = self.norm[1](x2, resid) - def ops_in_model_before(self): - ops_to_remove = [torch.ops.vllm.all_reduce.default] # Always removed by SP - # The following are only removed if fusion happens - if ( - self.vllm_config - and self.vllm_config.compilation_config.pass_config.enable_fusion - ): - ops_to_remove.extend( - [ - torch.ops._C.fused_add_rms_norm.default, - torch.ops._C.static_scaled_fp8_quant.default, - ] - ) - return ops_to_remove + z3 = self.fp8_linear.apply( + y2, self.w[1], self.wscale[1], input_scale=self.scale[1] + ) + + x3 = tensor_model_parallel_all_reduce(z3) + y3, resid = self.norm[2](x3, resid) # use resid here + + z4 = self.fp8_linear.apply( + y3, self.w[2], self.wscale[2], input_scale=self.scale[2] + ) + x4 = tensor_model_parallel_all_reduce(z4) + y4, resid = self.norm[3](x4, resid) # use resid here + return y4 def ops_in_model_after(self): - ops_to_add = [ - torch.ops.vllm.reduce_scatter.default, + return [ torch.ops.vllm.all_gather.default, + torch.ops.vllm.reduce_scatter.default, + ] + + def ops_in_model_before(self): + return [ + torch.ops.vllm.all_reduce.default, ] - # The following is only added 
if fusion happens - if ( - self.vllm_config - and self.vllm_config.compilation_config.pass_config.enable_fusion - ): - ops_to_add.append(torch.ops._C.fused_add_rms_norm_static_fp8_quant.default) - return ops_to_add def ops_in_model(self): - if ( - self.vllm_config - and self.vllm_config.compilation_config.pass_config.enable_fusion - ): - # If fusion happens, the fused op is the one - # we check for (de)functionalization + if self.vllm_config.compilation_config.pass_config.enable_fusion: return [torch.ops._C.fused_add_rms_norm_static_fp8_quant.default] - else: - # If no fusion, the original ops are checked + elif RMSNorm.enabled(): return [ torch.ops._C.fused_add_rms_norm.default, - # TODO functionalization pass does not handle this yet - # torch.ops._C.static_scaled_fp8_quant.default, ] + elif self.fp8_linear.quant_fp8.enabled(): + return [ + torch.ops._C.static_scaled_fp8_quant.default, + ] + else: + return [] @multi_gpu_test(num_gpus=2) -@pytest.mark.parametrize("test_model_cls", [TestModel, TestQuantModel]) +@pytest.mark.parametrize( + "test_model_cls, custom_ops", + [ + (TestAllReduceRMSNormModel, "+rms_norm"), + (TestAllReduceRMSNormModel, "-rms_norm"), + (TestAllReduceRMSNormStaticQuantFP8Model, "+rms_norm,+quant_fp8"), + (TestAllReduceRMSNormStaticQuantFP8Model, "+rms_norm,-quant_fp8"), + (TestAllReduceRMSNormStaticQuantFP8Model, "-rms_norm,+quant_fp8"), + (TestAllReduceRMSNormStaticQuantFP8Model, "-rms_norm,-quant_fp8"), + ], +) @pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("seq_len", [16]) @pytest.mark.parametrize("hidden_size", [16]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) @pytest.mark.parametrize("enable_fusion", [True, False]) +@pytest.mark.parametrize("dynamic", [False, True]) @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA") def test_sequence_parallelism_pass( test_model_cls: type[torch.nn.Module], + custom_ops: str, batch_size: int, seq_len: int, hidden_size: int, dtype: torch.dtype, enable_fusion: bool, + dynamic: bool, ): num_processes = 2 @@ -220,11 +206,13 @@ def run_torch_spawn(fn, nprocs): args=( num_processes, test_model_cls, + custom_ops, batch_size, seq_len, hidden_size, dtype, enable_fusion, + dynamic, ), nprocs=nprocs, ) @@ -236,11 +224,13 @@ def sequence_parallelism_pass_on_test_model( local_rank: int, world_size: int, test_model_cls: type[torch.nn.Module], + custom_ops: str, batch_size: int, seq_len: int, hidden_size: int, dtype: torch.dtype, enable_fusion: bool, + dynamic: bool, ): current_platform.seed_everything(0) @@ -264,12 +254,16 @@ def sequence_parallelism_pass_on_test_model( initialize_model_parallel(tensor_model_parallel_size=world_size) # configure vllm config for SequenceParallelismPass + custom_ops_list = custom_ops.split(",") if custom_ops else [] compilation_config = CompilationConfig( + splitting_ops=[], # avoid automatic rms_norm enablement + cudagraph_mode=CUDAGraphMode.NONE, # avoid piecewise warnings + custom_ops=custom_ops_list, pass_config=PassConfig( enable_sequence_parallelism=True, enable_fusion=enable_fusion, enable_noop=True, - ) + ), ) # NoOp needed for fusion device_config = DeviceConfig(device=torch.device("cuda")) @@ -289,7 +283,6 @@ def sequence_parallelism_pass_on_test_model( with set_current_vllm_config(vllm_config): noop_pass = NoOpEliminationPass(vllm_config) sequence_parallelism_pass = SequenceParallelismPass(vllm_config) - func_pass = FixFunctionalizationPass(vllm_config) cleanup_pass = PostCleanupPass(vllm_config) assert ( 
sequence_parallelism_pass.compilation_config.splitting_ops @@ -310,38 +303,29 @@ def sequence_parallelism_pass_on_test_model( passes_for_backend.append(cleanup_pass) - backend_no_func = TestBackend(*passes_for_backend) - backend_func = TestBackend(*passes_for_backend, func_pass) + backend = TestBackend(*passes_for_backend) - model = test_model_cls(hidden_size, hidden_size * 2) + model = test_model_cls(hidden_size) hidden_states = torch.randn((batch_size * seq_len, hidden_size), dtype=dtype) - residual = torch.randn((batch_size * seq_len, hidden_size), dtype=dtype) - compiled_model_no_func = torch.compile(model, backend=backend_no_func) - compiled_model_no_func(hidden_states, residual) - compiled_model_func = torch.compile(model, backend=backend_func) - compiled_model_func(hidden_states, residual) + if dynamic: + torch._dynamo.mark_dynamic(hidden_states, 0) + + compiled_model = torch.compile(model, backend=backend) + compiled_model(hidden_states) - assert sequence_parallelism_pass.matched_count == 1 + assert sequence_parallelism_pass.matched_count == 4 # In pre-nodes, all reduce should be there, # reduce scatter and all gather should not - backend_no_func.check_before_ops(model.ops_in_model_before()) + for op in model.ops_in_model_before(): + assert backend.op_count(op, before=True) == 4 # In post-nodes, reduce scatter and all gather should be there, # all reduce should not - backend_no_func.check_after_ops(model.ops_in_model_after()) + for op in model.ops_in_model_after(): + assert backend.op_count(op, before=False) == 4 - # check if the functionalization pass is applied for op in model.ops_in_model(): - find_auto_fn(backend_no_func.graph_post_pass.nodes, op) - assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, op) is None - - # make sure the ops were all de-functionalized - found = dict() - for node in backend_func.graph_post_pass.nodes: - for op in model.ops_in_model(): - if is_func(node, op): - found[op] = True - assert all(found[op] for op in model.ops_in_model()) + find_auto_fn(backend.graph_post_pass.nodes, op) diff --git a/tests/distributed/test_sequence_parallel.py b/tests/distributed/test_sequence_parallel.py index 94b2b51211a6..f38c509775ed 100644 --- a/tests/distributed/test_sequence_parallel.py +++ b/tests/distributed/test_sequence_parallel.py @@ -18,6 +18,7 @@ from vllm.config.compilation import CompilationMode from vllm.config.model import RunnerOption from vllm.logger import init_logger +from vllm.platforms import current_platform from vllm.utils.torch_utils import is_torch_equal_or_newer from ..models.registry import HF_EXAMPLE_MODELS @@ -161,6 +162,7 @@ def _compare_sp( test_options: SPTestOptions, num_gpus_available: int, use_inductor_graph_partition: bool, + enable_async_tp: bool, *, method: Literal["generate", "encode"], is_multimodal: bool, @@ -244,10 +246,10 @@ def _compare_sp( compilation_config = { "mode": CompilationMode.VLLM_COMPILE, - "custom_ops": ["+rms_norm"], "compile_sizes": [4, 8], "pass_config": { "enable_sequence_parallelism": True, + "enable_async_tp": enable_async_tp, "enable_fusion": enable_fusion, "enable_noop": True, }, @@ -307,6 +309,7 @@ def _compare_sp( ], ) @pytest.mark.parametrize("use_inductor_graph_partition", [True, False]) +@pytest.mark.parametrize("enable_async_tp", [False]) # TODO: enable async TP @create_new_process_for_each_test() def test_tp_sp_generation( model_id: str, @@ -316,10 +319,19 @@ def test_tp_sp_generation( test_options: SPTestOptions, num_gpus_available, use_inductor_graph_partition: bool, + enable_async_tp: 
bool, ): if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"): pytest.skip("inductor graph partition is only available in PyTorch 2.9+") + # Skip FP8 SP-only test on sm89 (compute capability 8.9) + if ( + "fp8" in model_id.lower() + and current_platform.get_device_capability() < (9, 0) + and (not enable_async_tp) + ): + pytest.skip("FP8 reduction support begins with sm90 capable devices.") + _compare_sp( model_id, parallel_setup, @@ -328,6 +340,7 @@ def test_tp_sp_generation( test_options, num_gpus_available, use_inductor_graph_partition, + enable_async_tp=enable_async_tp, method="generate", is_multimodal=False, ) diff --git a/vllm/compilation/sequence_parallelism.py b/vllm/compilation/sequence_parallelism.py index 31624a8fdcc0..bb4dcf12d865 100644 --- a/vllm/compilation/sequence_parallelism.py +++ b/vllm/compilation/sequence_parallelism.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import functools + import torch import torch._inductor.pattern_matcher as pm import torch.fx as fx @@ -10,98 +12,28 @@ from vllm.distributed import get_tp_group, tensor_model_parallel_all_reduce from vllm.distributed.parallel_state import get_tensor_model_parallel_world_size from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + kFp8StaticTensorSym, +) from vllm.platforms import current_platform from .inductor_pass import enable_fake_mode +from .matcher_utils import MatcherFusedAddRMSNorm, MatcherQuantFP8, MatcherRMSNorm +from .noop_elimination import NoOpEliminationPass from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass logger = init_logger(__name__) -class _RMSNormAndQuantOpHelper: - """Base helper for RMSNorm and RMSNorm + Quantization functionalization.""" +def get_first_out_wrapper(fn): + @functools.wraps(fn) + def wrapper(*args): + return fn(*args)[0] - def __init__( - self, - epsilon: float, - dtype: torch.dtype, - device: str, - quant_op: torch._ops.OpOverload | None = None, - **kwargs, - ): - self.epsilon = epsilon - self.dtype = dtype - self.device = device - self.quant_op = quant_op - - def _functional_rmsnorm(self, result_buffer, input_tensor, weight_tensor): - return torch.ops.higher_order.auto_functionalized( - torch.ops._C.rms_norm.default, - result=result_buffer, - input=input_tensor, - weight=weight_tensor, - epsilon=self.epsilon, - ) + return wrapper - def _functional_fused_add_rmsnorm( - self, input_tensor, residual_tensor, weight_tensor - ): - return torch.ops.higher_order.auto_functionalized( - torch.ops._C.fused_add_rms_norm.default, - input=input_tensor, - residual=residual_tensor, - weight=weight_tensor, - epsilon=self.epsilon, - ) - def _functional_rmsnorm_then_quant( - self, - rmsnorm_result_buffer, - quant_result_buffer, - input_tensor, - weight_tensor, - scale_tensor, - ): - if self.quant_op is None: - raise RuntimeError( - "_RMSNormAndQuantOpHelper was not initialized with a quant_op." 
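The `get_first_out_wrapper` helper introduced in this file simply adapts a multi-output pattern or replacement function so that only its first output is exposed to the pattern matcher. A toy usage (the `pattern` function below is illustrative, not one of the registered patterns):

    import functools

    def get_first_out_wrapper(fn):
        @functools.wraps(fn)
        def wrapper(*args):
            return fn(*args)[0]
        return wrapper

    def pattern(x, w):
        # returns (normalized, residual); only the first output should be matched
        return x + w, x

    only_first = get_first_out_wrapper(pattern)
    print(only_first(1, 2))  # 3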
- ) - rmsnorm_out_tuple = self._functional_rmsnorm( - rmsnorm_result_buffer, input_tensor, weight_tensor - ) - quant_out_tuple = torch.ops.higher_order.auto_functionalized( - self.quant_op, - result=quant_result_buffer, - input=rmsnorm_out_tuple[1], - scale=scale_tensor, - ) - return quant_out_tuple - - def _functional_fused_add_rmsnorm_then_quant( - self, - quant_result_buffer, - input_tensor, - residual_tensor, - weight_tensor, - scale_tensor, - ): - if self.quant_op is None: - raise RuntimeError( - "_RMSNormAndQuantOpHelper was not initialized with a quant_op." - ) - fused_add_rmsnorm_out_tuple = self._functional_fused_add_rmsnorm( - input_tensor, residual_tensor, weight_tensor - ) - quant_out_tuple = torch.ops.higher_order.auto_functionalized( - self.quant_op, - result=quant_result_buffer, - input=fused_add_rmsnorm_out_tuple[1], - scale=scale_tensor, - ) - return quant_out_tuple, fused_add_rmsnorm_out_tuple[2] - - -class _SequenceParallelPatternHelper(_RMSNormAndQuantOpHelper): +class _SequenceParallelPatternHelper: """Helper for sequence parallelism patterns.""" def __init__( @@ -109,10 +41,10 @@ def __init__( epsilon: float, dtype: torch.dtype, device: str, - quant_op: torch._ops.OpOverload | None = None, - **kwargs, ): - super().__init__(epsilon, dtype, device, quant_op=quant_op, **kwargs) + self.epsilon = epsilon + self.dtype = dtype + self.device = device self.tp_group = get_tp_group() self.tp_size = get_tensor_model_parallel_world_size() @@ -131,36 +63,34 @@ def _all_gather(self, x: torch.Tensor) -> torch.Tensor: class FirstAllReduceRMSNormPattern(_SequenceParallelPatternHelper): + def __init__(self, epsilon: float, dtype: torch.dtype, device: str): + super().__init__(epsilon, dtype, device) + self.rmsnorm_matcher = MatcherRMSNorm(epsilon) + def get_inputs(self): input = torch.empty([1, 8, 4], device=self.device, dtype=self.dtype) - permute = torch.empty([1, 8, 4], device=self.device, dtype=self.dtype) arg3_1 = torch.empty([4], device=self.device, dtype=self.dtype) - return [input, permute, arg3_1] + return [input, arg3_1] def register(self, pm_pass: PatternMatcherPass): def pattern( input: torch.Tensor, - permute: torch.Tensor, arg3_1: torch.Tensor, ): all_reduce = self._all_reduce(input) - rmsnorm = self._functional_rmsnorm(permute, all_reduce, arg3_1) + rmsnorm = self.rmsnorm_matcher(all_reduce, arg3_1) - return rmsnorm[1], all_reduce + return rmsnorm, all_reduce def replacement( input: torch.Tensor, - permute: torch.Tensor, arg3_1: torch.Tensor, ): reduce_scatter = self._reduce_scatter(input) - rmsnorm_result = torch.empty_like(reduce_scatter) - rmsnorm = self._functional_rmsnorm(rmsnorm_result, reduce_scatter, arg3_1) - - all_gather = self._all_gather(rmsnorm[1]) - + rmsnorm = self.rmsnorm_matcher(reduce_scatter, arg3_1) + all_gather = self._all_gather(rmsnorm) return all_gather, reduce_scatter pm.register_replacement( @@ -169,6 +99,10 @@ def replacement( class MiddleAllReduceRMSNormPattern(_SequenceParallelPatternHelper): + def __init__(self, epsilon: float, dtype: torch.dtype, device: str): + super().__init__(epsilon, dtype, device) + self.rmsnorm_matcher = MatcherFusedAddRMSNorm(epsilon) + def get_inputs(self): mm_1 = torch.empty([4, 4], device=self.device, dtype=self.dtype) @@ -188,67 +122,34 @@ def pattern( rms_norm_weights: torch.Tensor, ) -> tuple[torch.Tensor, torch.Tensor]: all_reduce = self._all_reduce(mm_1) - rmsnorm = self._functional_fused_add_rmsnorm( - all_reduce, residual, rms_norm_weights - ) - return rmsnorm[1], rmsnorm[2] + rmsnorm = 
self.rmsnorm_matcher(all_reduce, rms_norm_weights, residual) + return rmsnorm[0], rmsnorm[1] def replacement( residual: torch.Tensor, mm_1: torch.Tensor, rms_norm_weights: torch.Tensor, ) -> tuple[torch.Tensor, torch.Tensor]: + # pattern matcher replaces from top-to-bottom, + # so residual is still the full size here. + # once the seqpar pattern with the previous rmsnorm is replaced reduce_scatter = self._reduce_scatter(mm_1) - rmsnorm = self._functional_fused_add_rmsnorm( - reduce_scatter, residual, rms_norm_weights - ) - all_gather = self._all_gather(rmsnorm[1]) - return all_gather, rmsnorm[2] + residual = residual[0 : reduce_scatter.size(0), ...] + rmsnorm = self.rmsnorm_matcher(reduce_scatter, rms_norm_weights, residual) + all_gather = self._all_gather(rmsnorm[0]) + # shape of residual changes but that's fine, + # next node is already slicing it, now becomes a noop + return all_gather, rmsnorm[1] pm.register_replacement( pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass ) - - -class LastAllReduceRMSNormPattern(_SequenceParallelPatternHelper): - def get_inputs(self): - mm_1 = torch.empty([4, 4], device=self.device, dtype=self.dtype) - - residual = torch.empty([4, 4], device=self.device, dtype=self.dtype) - rms_norm_weights = torch.empty([4, 4], device=self.device, dtype=self.dtype) - - return [ - residual, - mm_1, - rms_norm_weights, - ] - - def register(self, pm_pass: PatternMatcherPass): - def pattern( - residual: torch.Tensor, - mm_1: torch.Tensor, - rms_norm_weights: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor]: - all_reduce = self._all_reduce(mm_1) - rmsnorm = self._functional_fused_add_rmsnorm( - all_reduce, residual, rms_norm_weights - ) - return rmsnorm[1] - - def replacement( - residual: torch.Tensor, - mm_1: torch.Tensor, - rms_norm_weights: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor]: - reduce_scatter = self._reduce_scatter(mm_1) - rmsnorm = self._functional_fused_add_rmsnorm( - reduce_scatter, residual, rms_norm_weights - ) - normalized = self._all_gather(rmsnorm[1]) - return normalized - pm.register_replacement( - pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass + get_first_out_wrapper(pattern), + get_first_out_wrapper(replacement), + self.get_inputs(), + pm.fwd_only, + pm_pass, ) @@ -257,52 +158,41 @@ def replacement( class FirstAllReduceRMSNormStaticFP8Pattern(_SequenceParallelPatternHelper): def __init__( - self, epsilon: float, dtype: torch.dtype, device: str, op: torch._ops.OpOverload + self, + epsilon: float, + dtype: torch.dtype, + device: str, ): - super().__init__(epsilon, dtype, device, quant_op=op) + super().__init__(epsilon, dtype, device) + self.rmsnorm_matcher = MatcherRMSNorm(epsilon) + self.quant_matcher = MatcherQuantFP8(kFp8StaticTensorSym) def get_inputs(self): input = torch.zeros([1, 8, 4], device=self.device, dtype=self.dtype) - rmsnorm_result = torch.empty([1, 8, 4], device=self.device, dtype=self.dtype) - quant_result = torch.empty([1, 8, 4], device=self.device, dtype=FP8_DTYPE) weight = torch.empty([4], device=self.device, dtype=self.dtype) scale = torch.tensor(1.0, device=self.device, dtype=torch.float32) - return [input, rmsnorm_result, quant_result, weight, scale] + return [input, weight, scale] def register(self, pm_pass: PatternMatcherPass): def pattern( input: torch.Tensor, - rmsnorm_result: torch.Tensor, - quant_result: torch.Tensor, weight: torch.Tensor, scale: torch.Tensor, ): all_reduce = self._all_reduce(input) - static_fp8 = self._functional_rmsnorm_then_quant( - rmsnorm_result, quant_result, 
all_reduce, weight, scale - ) - return static_fp8[1], all_reduce + rms = self.rmsnorm_matcher(all_reduce, weight) + quant, _ = self.quant_matcher(rms, scale) + return quant, all_reduce def replacement( input: torch.Tensor, - rmsnorm_result: torch.Tensor, - quant_result: torch.Tensor, weight: torch.Tensor, scale: torch.Tensor, ): reduce_scatter = self._reduce_scatter(input) - - rmsnorm_result = torch.empty_like( - reduce_scatter, dtype=rmsnorm_result.dtype - ) - quant_result = torch.empty_like( - rmsnorm_result, # Output of RMSNorm - dtype=quant_result.dtype, - ) - static_fp8 = self._functional_rmsnorm_then_quant( - rmsnorm_result, quant_result, reduce_scatter, weight, scale - ) - all_gather = self._all_gather(static_fp8[1]) + rms = self.rmsnorm_matcher(reduce_scatter, weight) + quant, _ = self.quant_matcher(rms, scale) + all_gather = self._all_gather(quant) return all_gather, reduce_scatter @@ -312,118 +202,64 @@ def replacement( class MiddleAllReduceRMSNormStaticFP8Pattern(_SequenceParallelPatternHelper): - def __init__( - self, epsilon: float, dtype: torch.dtype, device: str, op: torch._ops.OpOverload - ): - super().__init__(epsilon, dtype, device, quant_op=op) + def __init__(self, epsilon: float, dtype: torch.dtype, device: str): + super().__init__(epsilon, dtype, device) + self.rmsnorm_matcher = MatcherFusedAddRMSNorm(epsilon) + self.quant_matcher = MatcherQuantFP8(kFp8StaticTensorSym) def get_inputs(self): mm_1 = torch.empty([4, 4], device=self.device, dtype=self.dtype) - residual = torch.empty([4, 4], device=self.device, dtype=self.dtype) rms_norm_weights = torch.empty([4, 4], device=self.device, dtype=self.dtype) - result = torch.empty([4, 4], device=self.device, dtype=FP8_DTYPE) scale = torch.empty([1, 1], device=self.device, dtype=torch.float32) - return [ - result, - residual, - mm_1, - rms_norm_weights, - scale, - ] + return [residual, mm_1, rms_norm_weights, scale] def register(self, pm_pass: PatternMatcherPass): def pattern( - result: torch.Tensor, residual: torch.Tensor, mm_1: torch.Tensor, rms_norm_weights: torch.Tensor, scale: torch.Tensor, ) -> tuple[torch.Tensor, torch.Tensor]: all_reduce = self._all_reduce(mm_1) - static_fp8, rmsnorm_residual_out = ( - self._functional_fused_add_rmsnorm_then_quant( # noqa: E501 - result, all_reduce, residual, rms_norm_weights, scale - ) + rms, residual_out = self.rmsnorm_matcher( + all_reduce, rms_norm_weights, residual ) - return static_fp8[1], rmsnorm_residual_out + quant, _ = self.quant_matcher(rms, scale) + return quant, residual_out def replacement( - result: torch.Tensor, residual: torch.Tensor, mm_1: torch.Tensor, rms_norm_weights: torch.Tensor, scale: torch.Tensor, ) -> tuple[torch.Tensor, torch.Tensor]: + # pattern matcher replaces from top-to-bottom, + # so residual is still the full size here. + # add a temporary slice which will become a noop + # once the seqpar pattern with the previous rmsnorm is replaced reduce_scatter = self._reduce_scatter(mm_1) - quant_result_buf = torch.empty_like(reduce_scatter, dtype=result.dtype) - static_fp8, rmsnorm_residual_out = ( - self._functional_fused_add_rmsnorm_then_quant( # noqa: E501 - quant_result_buf, reduce_scatter, residual, rms_norm_weights, scale - ) + residual = residual[0 : reduce_scatter.size(0), ...] 
+ rms, residual_out = self.rmsnorm_matcher( + reduce_scatter, rms_norm_weights, residual ) - all_gather = self._all_gather(static_fp8[1]) - return all_gather, rmsnorm_residual_out + quant, _ = self.quant_matcher(rms, scale) + all_gather = self._all_gather(quant) + # shape of residual changes but that's fine, + # next node is already slicing it, now becomes a noop + return all_gather, residual_out pm.register_replacement( pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass ) - -class LastAllReduceRMSNormStaticFP8Pattern(_SequenceParallelPatternHelper): - def __init__( - self, epsilon: float, dtype: torch.dtype, device: str, op: torch._ops.OpOverload - ): - super().__init__(epsilon, dtype, device, quant_op=op) - - def get_inputs(self): - mm_1 = torch.empty([4, 4], device=self.device, dtype=self.dtype) - - residual = torch.empty([4, 4], device=self.device, dtype=self.dtype) - rms_norm_weights = torch.empty([4, 4], device=self.device, dtype=self.dtype) - result = torch.empty([4, 4], device=self.device, dtype=FP8_DTYPE) - scale = torch.empty([1, 1], device=self.device, dtype=torch.float32) - - return [ - result, - residual, - mm_1, - rms_norm_weights, - scale, - ] - - def register(self, pm_pass: PatternMatcherPass): - def pattern( - result: torch.Tensor, - residual: torch.Tensor, - mm_1: torch.Tensor, - rms_norm_weights: torch.Tensor, - scale: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor]: - all_reduce = self._all_reduce(mm_1) - static_fp8, _ = self._functional_fused_add_rmsnorm_then_quant( - result, all_reduce, residual, rms_norm_weights, scale - ) - return static_fp8[1] - - def replacement( - result: torch.Tensor, - residual: torch.Tensor, - mm_1: torch.Tensor, - rms_norm_weights: torch.Tensor, - scale: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor]: - reduce_scatter = self._reduce_scatter(mm_1) - quant_result_buf = torch.empty_like(reduce_scatter, dtype=result.dtype) - static_fp8, _ = self._functional_fused_add_rmsnorm_then_quant( - quant_result_buf, reduce_scatter, residual, rms_norm_weights, scale - ) - normalized = self._all_gather(static_fp8[1]) - return normalized - pm.register_replacement( - pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass + get_first_out_wrapper(pattern), + get_first_out_wrapper(replacement), + self.get_inputs(), + pm.fwd_only, + pm_pass, ) @@ -445,27 +281,45 @@ class SequenceParallelismPass(VllmPatternMatcherPass): GEMM + ReduceScatter and AllGather + GEMM fusions. These fusions can significantly reduce communication overhead and improve overall model performance. + + + This pass splits up the residual tensor across TP ranks and hence divides its size. + Because the pattern matcher starts at the end of the graph, the replacement + contains a slice that temporarily conforms the input residual to the correct size. + After all patterns have been matched, we use a NoOpEliminationPass to clean up + what have now become no-op slices. + + Note that an older version of the pass did not need this as it operated only on + custom rms_norm and fused_rms_norm_add custom ops which did not complain about + mismatched shapes during replacement. So this approach has the same assumption that + correctness is only maintained if all rms_norm operations are split across ranks. + + Correctness-wise, this is approach strictly better than before - before, + the graph was incorrect semantically and shape-wise during the pass. + With this approach there's only semantic incorrectness during the pass. 
+ Both approaches restore a correct graph once all patterns are matched. """ @enable_fake_mode def __init__(self, config: VllmConfig): super().__init__(config) + # Used to cleanup redundant views created temporarily + # to circumvent residual shape change issues + self.noop_cleanup = NoOpEliminationPass(config) + self.noop_cleanup.pass_name = f"{self.pass_name}.{self.noop_cleanup.pass_name}" + self.patterns: PatternMatcherPass = PatternMatcherPass( pass_name="sequence_parallelism_pass" ) for epsilon in [1e-5, 1e-6]: # RMSNorm + Static FP8 quantization patterns - fp8_quant_op = torch.ops._C.static_scaled_fp8_quant.default FirstAllReduceRMSNormStaticFP8Pattern( - epsilon, self.model_dtype, self.device, fp8_quant_op + epsilon, self.model_dtype, self.device ).register(self.patterns) MiddleAllReduceRMSNormStaticFP8Pattern( - epsilon, self.model_dtype, self.device, fp8_quant_op - ).register(self.patterns) - LastAllReduceRMSNormStaticFP8Pattern( - epsilon, self.model_dtype, self.device, fp8_quant_op + epsilon, self.model_dtype, self.device ).register(self.patterns) # Normal RMSNorm patterns @@ -477,9 +331,6 @@ def __init__(self, config: VllmConfig): epsilon, self.model_dtype, self.device ).register(self.patterns) - LastAllReduceRMSNormPattern( - epsilon, self.model_dtype, self.device - ).register(self.patterns) self.dump_patterns(config, self.patterns) def is_applicable(self, shape: int | None) -> bool: @@ -508,3 +359,5 @@ def is_applicable(self, shape: int | None) -> bool: def __call__(self, graph: fx.Graph): self.matched_count = self.patterns.apply(graph) logger.debug("Replaced %s patterns", self.matched_count) + # Clean up reshape nodes + self.noop_cleanup(graph) diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 87f6b6eed851..bd98be48588f 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -445,8 +445,6 @@ def __post_init__(self): # and requires it to be enabled. if self.compilation_config.pass_config.enable_async_tp: self.compilation_config.pass_config.enable_sequence_parallelism = True - if self.compilation_config.pass_config.enable_sequence_parallelism: - self.compilation_config.custom_ops.append("+rms_norm") if current_platform.support_static_graph_mode(): # if cudagraph_mode is not explicitly set by users, set default @@ -620,6 +618,32 @@ def __post_init__(self): if self.compilation_config.mode == CompilationMode.VLLM_COMPILE: self.compilation_config.set_splitting_ops_for_v1() + if self.compilation_config.pass_config.enable_sequence_parallelism: + # With pipeline parallelism or dynamo partitioning, + # native rms norm tracing errors due to incorrect residual shape. + # Use custom rms norm to unblock. In the future, + # the pass will operate on higher-level IR to avoid the issue. 
+ # TODO: https://github.com/vllm-project/vllm/issues/27894 + is_fullgraph = ( + self.compilation_config.use_inductor_graph_partition + or len(self.compilation_config.splitting_ops) == 0 + ) + if self.parallel_config.pipeline_parallel_size > 1 or not is_fullgraph: + if "-rms_norm" not in self.compilation_config.custom_ops: + self.compilation_config.custom_ops.append("+rms_norm") + else: + regime = ( + "Dynamo partition" + if not is_fullgraph + else "pipeline parallelism" + ) + logger.warning_once( + "Sequence parallelism not supported with" + "native rms_norm when using %s, " + "this will likely lead to an error.", + regime, + ) + # final check of cudagraph mode after all possible updates if current_platform.is_cuda_alike(): if ( From cb15ee28db037cff93a32aa237c862fc949824ce Mon Sep 17 00:00:00 2001 From: tingtinggithub Date: Sat, 15 Nov 2025 04:18:08 -0800 Subject: [PATCH 099/578] Allow Gemma3 to take image embeddings (#28483) Signed-off-by: tingtinggithub --- docs/models/supported_models.md | 2 +- vllm/model_executor/models/gemma3_mm.py | 77 ++++++++++++++++++------- vllm/multimodal/parse.py | 11 ++-- vllm/v1/engine/processor.py | 8 ++- 4 files changed, 69 insertions(+), 29 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 9cdf644c3cc5..6eb0947fe568 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -669,7 +669,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `DeepseekOCRForCausalLM` | DeepSeek-OCR | T + I+ | `deepseek-ai/DeepSeek-OCR`, etc. | | ✅︎ | | `Ernie4_5_VLMoeForConditionalGeneration` | Ernie4.5-VL | T + I+/ V+ | `baidu/ERNIE-4.5-VL-28B-A3B-PT`, `baidu/ERNIE-4.5-VL-424B-A47B-PT` | | ✅︎ | | `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ | -| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I+ | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | +| `Gemma3ForConditionalGeneration` | Gemma 3 | T + IE+ | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | | `Gemma3nForConditionalGeneration` | Gemma 3n | T + I + A | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | | `GLM4VForCausalLM`^ | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ | | `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + IE+ + VE+ | `zai-org/GLM-4.1V-9B-Thinking`, etc. 
| ✅︎ | ✅︎ | diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index 02fb7ef31dc9..8e2bbe8f7990 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from collections.abc import Iterable, Mapping, Sequence -from typing import Annotated, Any, Literal +from typing import Annotated, Any, Literal, TypeAlias import torch from torch import nn @@ -20,7 +20,12 @@ MultiModalFieldConfig, MultiModalKwargsItems, ) -from vllm.multimodal.parse import ImageProcessorItems, ImageSize, MultiModalDataItems +from vllm.multimodal.parse import ( + ImageEmbeddingItems, + ImageProcessorItems, + ImageSize, + MultiModalDataItems, +) from vllm.multimodal.processing import ( BaseMultiModalProcessor, BaseProcessingInfo, @@ -71,7 +76,15 @@ class Gemma3ImagePixelInputs(TensorSchema): num_patches: Annotated[torch.Tensor, TensorShape("bn")] -Gemma3ImageInputs = Gemma3ImagePixelInputs +class Gemma3ImageEmbeddingInputs(TensorSchema): + type: Literal["image_embeds"] = "image_embeds" + image_embeds: Annotated[ + torch.Tensor, + TensorShape("ni", "nf", "hs"), + ] + + +Gemma3ImageInputs: TypeAlias = Gemma3ImagePixelInputs | Gemma3ImageEmbeddingInputs class Gemma3ProcessingInfo(BaseProcessingInfo): @@ -178,8 +191,9 @@ def get_num_crops( def get_image_repl( self, *, - image_width: int, - image_height: int, + image_width: int | None, + image_height: int | None, + num_crops: int | None = None, processor: Gemma3Processor | None, ) -> PromptUpdateDetails[str]: if processor is None: @@ -187,11 +201,13 @@ def get_image_repl( boi_token = processor.boi_token - num_crops = self.get_num_crops( - image_width=image_width, - image_height=image_height, - processor=processor, - ) + if num_crops is None: + assert image_width is not None and image_height is not None + num_crops = self.get_num_crops( + image_width=image_width, + image_height=image_height, + processor=processor, + ) if num_crops == 0: image_text = boi_token @@ -321,6 +337,7 @@ def _get_mm_fields_config( return dict( pixel_values=MultiModalFieldConfig.flat_from_sizes("image", num_patches), num_patches=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), ) def _get_prompt_updates( @@ -333,7 +350,19 @@ def _get_prompt_updates( image_token = hf_processor.boi_token def get_replacement_gemma3(item_idx: int): - images = mm_items.get_items("image", ImageProcessorItems) + images = mm_items.get_items( + "image", (ImageEmbeddingItems, ImageProcessorItems) + ) + + if isinstance(images, ImageEmbeddingItems): + # For image embedding inputs, only support no crops cases + # since it's not supported in hf processor anyway + return self.info.get_image_repl( + image_width=None, + image_height=None, + num_crops=0, + processor=hf_processor, + ) image_size = images.get_image_size(item_idx) return self.info.get_image_repl( @@ -557,17 +586,19 @@ def _parse_and_validate_image_input( pixel_values = kwargs.pop("pixel_values", None) num_patches = kwargs.pop("num_patches", None) image_embeds = kwargs.pop("image_embeds", None) - assert image_embeds is None, "Gemma3 does not support image_embeds." 
- if pixel_values is None: - return None - image_size = self.config.vision_config.image_size - - return Gemma3ImagePixelInputs( - pixel_values=pixel_values, - num_patches=num_patches, - resolve_bindings={"h": image_size, "w": image_size}, - ) + if pixel_values is not None: + image_size = self.config.vision_config.image_size + return Gemma3ImagePixelInputs( + pixel_values=pixel_values, + num_patches=num_patches, + resolve_bindings={"h": image_size, "w": image_size}, + ) + elif image_embeds is not None: + return Gemma3ImageEmbeddingInputs( + image_embeds=image_embeds, + type="image_embeds", + ) def _image_pixels_to_features( self, @@ -579,7 +610,9 @@ def _image_pixels_to_features( def _process_image_input( self, image_input: Gemma3ImageInputs, - ) -> list[torch.Tensor]: + ) -> torch.Tensor | list[torch.Tensor]: + if image_input["type"] == "image_embeds": + return image_input["image_embeds"] assert self.vision_tower is not None pixel_values = image_input["pixel_values"] diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index 2fa3f6ebcc11..810f29072a0f 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -359,8 +359,9 @@ def __init__( ) self.video_needs_metadata = video_needs_metadata - def _is_embeddings( - self, data: object + @classmethod + def is_embeddings( + cls, data: object ) -> TypeGuard[torch.Tensor | list[torch.Tensor]]: if isinstance(data, torch.Tensor): return data.ndim == 3 @@ -420,7 +421,7 @@ def _parse_audio_data( ): return None - if self._is_embeddings(data): + if self.is_embeddings(data): return AudioEmbeddingItems(data) data_items: list[AudioItem] @@ -458,7 +459,7 @@ def _parse_image_data( if self._is_empty(data): return None - if self._is_embeddings(data): + if self.is_embeddings(data): return ImageEmbeddingItems(data) if ( @@ -484,7 +485,7 @@ def _parse_video_data( if self._is_empty(data): return None - if self._is_embeddings(data): + if self.is_embeddings(data): return VideoEmbeddingItems(data) data_items: list[VideoItem] diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 0404f6ff2771..fffd075a5165 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -14,6 +14,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.multimodal.cache import processor_cache_from_config from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalUUIDDict +from vllm.multimodal.parse import MultiModalDataParser from vllm.multimodal.processing import EncDecMultiModalProcessor from vllm.multimodal.utils import argsort_mm_positions from vllm.pooling_params import PoolingParams @@ -340,7 +341,12 @@ def _extract_mm_data(p: PromptType): mm_uuids: dict[str, list[str | None] | str] = {} for modality, data in mm_data.items(): - n = len(data) if isinstance(data, list) else 1 + # Hash each item for embedding inputs. 
+ n = ( + len(data) + if isinstance(data, list) or MultiModalDataParser.is_embeddings(data) + else 1 + ) mm_uuids[modality] = [f"{request_id}-{modality}-{i}" for i in range(n)] return mm_uuids From 89d3679221023fc18fd47df8fc426347fa9694e1 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 15 Nov 2025 21:33:27 +0800 Subject: [PATCH 100/578] [Doc] Fix failing doc build (#28772) Signed-off-by: DarkLight1337 Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/README.md | 4 +- docs/cli/bench/latency.md | 4 +- docs/cli/bench/serve.md | 4 +- docs/cli/bench/sweep/plot.md | 4 +- docs/cli/bench/sweep/serve.md | 4 +- docs/cli/bench/sweep/serve_sla.md | 4 +- docs/cli/bench/throughput.md | 4 +- docs/cli/chat.md | 4 +- docs/cli/complete.md | 4 +- docs/cli/run-batch.md | 4 +- docs/cli/serve.md | 4 +- docs/configuration/serve_args.md | 2 +- docs/mkdocs/hooks/generate_argparse.py | 77 ++++++++++++++++---------- docs/usage/README.md | 2 +- 14 files changed, 72 insertions(+), 53 deletions(-) diff --git a/docs/README.md b/docs/README.md index 0608794e7e65..0c279c19f96c 100644 --- a/docs/README.md +++ b/docs/README.md @@ -30,8 +30,8 @@ Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at Where to get started with vLLM depends on the type of user. If you are looking to: - Run open-source models on vLLM, we recommend starting with the [Quickstart Guide](./getting_started/quickstart.md) -- Build applications with vLLM, we recommend starting with the [User Guide](./usage) -- Build vLLM, we recommend starting with [Developer Guide](./contributing) +- Build applications with vLLM, we recommend starting with the [User Guide](./usage/README.md) +- Build vLLM, we recommend starting with [Developer Guide](./contributing/README.md) For information about the development of vLLM, see: diff --git a/docs/cli/bench/latency.md b/docs/cli/bench/latency.md index 21ab13e63781..ea7ea7321ffc 100644 --- a/docs/cli/bench/latency.md +++ b/docs/cli/bench/latency.md @@ -4,6 +4,6 @@ --8<-- "docs/cli/json_tip.inc.md" -## Options +## Arguments ---8<-- "docs/argparse/bench_latency.md" +--8<-- "docs/argparse/bench_latency.inc.md" diff --git a/docs/cli/bench/serve.md b/docs/cli/bench/serve.md index f7c415c6becb..f7dc8036cc26 100644 --- a/docs/cli/bench/serve.md +++ b/docs/cli/bench/serve.md @@ -4,6 +4,6 @@ --8<-- "docs/cli/json_tip.inc.md" -## Options +## Arguments ---8<-- "docs/argparse/bench_serve.md" +--8<-- "docs/argparse/bench_serve.inc.md" diff --git a/docs/cli/bench/sweep/plot.md b/docs/cli/bench/sweep/plot.md index f29bffb64655..a101330e093c 100644 --- a/docs/cli/bench/sweep/plot.md +++ b/docs/cli/bench/sweep/plot.md @@ -4,6 +4,6 @@ --8<-- "docs/cli/json_tip.inc.md" -## Options +## Arguments ---8<-- "docs/argparse/bench_sweep_plot.md" +--8<-- "docs/argparse/bench_sweep_plot.inc.md" diff --git a/docs/cli/bench/sweep/serve.md b/docs/cli/bench/sweep/serve.md index 5b5f91a951ed..f0468f06fc28 100644 --- a/docs/cli/bench/sweep/serve.md +++ b/docs/cli/bench/sweep/serve.md @@ -4,6 +4,6 @@ --8<-- "docs/cli/json_tip.inc.md" -## Options +## Arguments ---8<-- "docs/argparse/bench_sweep_serve.md" +--8<-- "docs/argparse/bench_sweep_serve.inc.md" diff --git a/docs/cli/bench/sweep/serve_sla.md b/docs/cli/bench/sweep/serve_sla.md index 5f8ab6005e50..5642ec67eb00 100644 --- a/docs/cli/bench/sweep/serve_sla.md +++ b/docs/cli/bench/sweep/serve_sla.md @@ -4,6 +4,6 @@ --8<-- "docs/cli/json_tip.inc.md" -## Options +## Arguments 
---8<-- "docs/argparse/bench_sweep_serve_sla.md" +--8<-- "docs/argparse/bench_sweep_serve_sla.inc.md" diff --git a/docs/cli/bench/throughput.md b/docs/cli/bench/throughput.md index e4ff5ce43c9c..e7f618fb4d14 100644 --- a/docs/cli/bench/throughput.md +++ b/docs/cli/bench/throughput.md @@ -4,6 +4,6 @@ --8<-- "docs/cli/json_tip.inc.md" -## Options +## Arguments ---8<-- "docs/argparse/bench_throughput.md" +--8<-- "docs/argparse/bench_throughput.inc.md" diff --git a/docs/cli/chat.md b/docs/cli/chat.md index b006cb8de60d..0246bd431b10 100644 --- a/docs/cli/chat.md +++ b/docs/cli/chat.md @@ -1,5 +1,5 @@ # vllm chat -## Options +## Arguments ---8<-- "docs/argparse/chat.md" +--8<-- "docs/argparse/chat.inc.md" diff --git a/docs/cli/complete.md b/docs/cli/complete.md index 400359acf4fb..eb2ffdaabac2 100644 --- a/docs/cli/complete.md +++ b/docs/cli/complete.md @@ -1,5 +1,5 @@ # vllm complete -## Options +## Arguments ---8<-- "docs/argparse/complete.md" +--8<-- "docs/argparse/complete.inc.md" diff --git a/docs/cli/run-batch.md b/docs/cli/run-batch.md index f7d401b8dad2..758fbda28397 100644 --- a/docs/cli/run-batch.md +++ b/docs/cli/run-batch.md @@ -4,6 +4,6 @@ --8<-- "docs/cli/json_tip.inc.md" -## Options +## Arguments ---8<-- "docs/argparse/run-batch.md" +--8<-- "docs/argparse/run-batch.inc.md" diff --git a/docs/cli/serve.md b/docs/cli/serve.md index 2c8f9d320f5d..35652fec587b 100644 --- a/docs/cli/serve.md +++ b/docs/cli/serve.md @@ -4,6 +4,6 @@ --8<-- "docs/cli/json_tip.inc.md" -## Options +## Arguments ---8<-- "docs/argparse/serve.md" +--8<-- "docs/argparse/serve.inc.md" diff --git a/docs/configuration/serve_args.md b/docs/configuration/serve_args.md index c1cc5577bc7a..baaf21f01f06 100644 --- a/docs/configuration/serve_args.md +++ b/docs/configuration/serve_args.md @@ -5,7 +5,7 @@ The `vllm serve` command is used to launch the OpenAI-compatible server. ## CLI Arguments The `vllm serve` command is used to launch the OpenAI-compatible server. -To see the available options, take a look at the [CLI Reference](../cli/README.md#options)! +To see the available options, take a look at the [CLI Reference](../cli/README.md)! 
## Configuration file diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py index ce1c5c53cf35..735074c08b8c 100644 --- a/docs/mkdocs/hooks/generate_argparse.py +++ b/docs/mkdocs/hooks/generate_argparse.py @@ -1,12 +1,15 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import importlib +import importlib.metadata +import importlib.util import logging import sys import traceback -from argparse import SUPPRESS, HelpFormatter +from argparse import SUPPRESS, Action, HelpFormatter +from collections.abc import Iterable +from importlib.machinery import ModuleSpec from pathlib import Path -from typing import Literal +from typing import TYPE_CHECKING, Literal from unittest.mock import MagicMock, patch from pydantic_core import core_schema @@ -19,6 +22,11 @@ sys.path.insert(0, str(ROOT_DIR)) +def mock_if_no_torch(mock_module: str, mock: MagicMock): + if not importlib.util.find_spec("torch"): + sys.modules[mock_module] = mock + + # Mock custom op code class MockCustomOp: @staticmethod @@ -29,18 +37,21 @@ def decorator(cls): return decorator -noop = lambda *a, **k: None -sys.modules["vllm._C"] = MagicMock() -sys.modules["vllm.model_executor.custom_op"] = MagicMock(CustomOp=MockCustomOp) -sys.modules["vllm.utils.torch_utils"] = MagicMock(direct_register_custom_op=noop) +mock_if_no_torch("vllm._C", MagicMock()) +mock_if_no_torch("vllm.model_executor.custom_op", MagicMock(CustomOp=MockCustomOp)) +mock_if_no_torch( + "vllm.utils.torch_utils", MagicMock(direct_register_custom_op=lambda *a, **k: None) +) + # Mock any version checks by reading from compiled CI requirements with open(ROOT_DIR / "requirements/test.txt") as f: VERSIONS = dict(line.strip().split("==") for line in f if "==" in line) importlib.metadata.version = lambda name: VERSIONS.get(name) or "0.0.0" + # Make torch.nn.Parameter safe to inherit from -sys.modules["torch.nn"] = MagicMock(Parameter=object) +mock_if_no_torch("torch.nn", MagicMock(Parameter=object)) class PydanticMagicMock(MagicMock): @@ -49,31 +60,34 @@ class PydanticMagicMock(MagicMock): def __init__(self, *args, **kwargs): name = kwargs.pop("name", None) super().__init__(*args, **kwargs) - self.__spec__ = importlib.machinery.ModuleSpec(name, None) + self.__spec__ = ModuleSpec(name, None) def __get_pydantic_core_schema__(self, source_type, handler): return core_schema.any_schema() -def auto_mock(module, attr, max_mocks=100): +def auto_mock(module_name: str, attr: str, max_mocks: int = 100): """Function that automatically mocks missing modules during imports.""" - logger.info("Importing %s from %s", attr, module) + logger.info("Importing %s from %s", attr, module_name) + for _ in range(max_mocks): try: + module = importlib.import_module(module_name) + # First treat attr as an attr, then as a submodule - return getattr( - importlib.import_module(module), - attr, - importlib.import_module(f"{module}.{attr}"), - ) + if hasattr(module, attr): + return getattr(module, attr) + + return importlib.import_module(f"{module_name}.{attr}") except ModuleNotFoundError as e: + assert e.name is not None logger.info("Mocking %s for argparse doc generation", e.name) sys.modules[e.name] = PydanticMagicMock(name=e.name) - except Exception as e: - logger.warning("Failed to import %s.%s: %s", module, attr, e) + except Exception: + logger.exception("Failed to import %s.%s: %s", module_name, attr) raise ImportError( - f"Failed to import {module}.{attr} after mocking {max_mocks} imports" + f"Failed to import 
{module_name}.{attr} after mocking {max_mocks} imports" ) @@ -91,21 +105,26 @@ def auto_mock(module, attr, max_mocks=100): CompleteCommand = auto_mock("vllm.entrypoints.cli.openai", "CompleteCommand") openai_cli_args = auto_mock("vllm.entrypoints.openai", "cli_args") openai_run_batch = auto_mock("vllm.entrypoints.openai", "run_batch") -FlexibleArgumentParser = auto_mock( - "vllm.utils.argparse_utils", "FlexibleArgumentParser" -) + +if TYPE_CHECKING: + from vllm.utils.argparse_utils import FlexibleArgumentParser +else: + FlexibleArgumentParser = auto_mock( + "vllm.utils.argparse_utils", "FlexibleArgumentParser" + ) class MarkdownFormatter(HelpFormatter): """Custom formatter that generates markdown for argument groups.""" - def __init__(self, prog, starting_heading_level=3): - super().__init__(prog, max_help_position=float("inf"), width=float("inf")) + def __init__(self, prog: str, starting_heading_level: int = 3): + super().__init__(prog, max_help_position=sys.maxsize, width=sys.maxsize) + self._section_heading_prefix = "#" * starting_heading_level self._argument_heading_prefix = "#" * (starting_heading_level + 1) self._markdown_output = [] - def start_section(self, heading): + def start_section(self, heading: str): if heading not in {"positional arguments", "options"}: heading_md = f"\n{self._section_heading_prefix} {heading}\n\n" self._markdown_output.append(heading_md) @@ -113,14 +132,14 @@ def start_section(self, heading): def end_section(self): pass - def add_text(self, text): + def add_text(self, text: str): if text: self._markdown_output.append(f"{text.strip()}\n\n") def add_usage(self, usage, actions, groups, prefix=None): pass - def add_arguments(self, actions): + def add_arguments(self, actions: Iterable[Action]): for action in actions: if len(action.option_strings) == 0 or "--help" in action.option_strings: continue @@ -169,7 +188,7 @@ def create_parser(add_cli_args, **kwargs) -> FlexibleArgumentParser: # Auto-mock runtime imports if tb_list := traceback.extract_tb(e.__traceback__): path = Path(tb_list[-1].filename).relative_to(ROOT_DIR) - auto_mock(module=".".join(path.parent.parts), attr=path.stem) + auto_mock(module_name=".".join(path.parent.parts), attr=path.stem) return create_parser(add_cli_args, **kwargs) else: raise e @@ -209,7 +228,7 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool): # Generate documentation for each parser for stem, parser in parsers.items(): - doc_path = ARGPARSE_DOC_DIR / f"{stem}.md" + doc_path = ARGPARSE_DOC_DIR / f"{stem}.inc.md" # Specify encoding for building on Windows with open(doc_path, "w", encoding="utf-8") as f: f.write(super(type(parser), parser).format_help()) diff --git a/docs/usage/README.md b/docs/usage/README.md index 0c63d01f0f99..4e8ece2c0605 100644 --- a/docs/usage/README.md +++ b/docs/usage/README.md @@ -1,6 +1,6 @@ # Using vLLM -First, vLLM must be [installed](../getting_started/installation/) for your chosen device in either a Python or Docker environment. +First, vLLM must be [installed](../getting_started/installation/README.md) for your chosen device in either a Python or Docker environment. 
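A minimal standalone sketch of the retry-on-`ModuleNotFoundError` idea used by `auto_mock` in `docs/mkdocs/hooks/generate_argparse.py` above: each failed import registers a `MagicMock` under the missing package name and retries, so argparse docs can be built without heavy optional dependencies installed. The helper name and retry limit below are illustrative only, not vLLM APIs.

```python
import importlib
import sys
from unittest.mock import MagicMock


def import_with_mocks(module_name: str, max_mocks: int = 100):
    """Import a module, stubbing out any missing third-party dependencies."""
    for _ in range(max_mocks):
        try:
            return importlib.import_module(module_name)
        except ModuleNotFoundError as e:
            # The import pulled in a package that is not installed; register a
            # MagicMock under that name in sys.modules and retry the import.
            sys.modules[e.name] = MagicMock(name=e.name)
    raise ImportError(f"gave up on {module_name} after {max_mocks} mocked imports")
```
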
Then, vLLM supports the following usage patterns: From 085a5253321a66d7aac0f990d82417ad85ec0eb0 Mon Sep 17 00:00:00 2001 From: hwhaokun Date: Sat, 15 Nov 2025 21:44:12 +0800 Subject: [PATCH 101/578] [Model] Fix lmhead init bug of bailing_moe (#28777) Signed-off-by: hwhaokun Co-authored-by: zhaozx-cn Co-authored-by: Jee Jee Li --- vllm/model_executor/models/bailing_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py index 6e1e5b1ddc50..024425bb2440 100644 --- a/vllm/model_executor/models/bailing_moe.py +++ b/vllm/model_executor/models/bailing_moe.py @@ -599,7 +599,7 @@ def __init__( config.vocab_size, config.hidden_size, quant_config=quant_config, - prefix=f"{prefix}.lm_head", + prefix=maybe_prefix(prefix, "lm_head"), ) self.logits_processor = LogitsProcessor(config.vocab_size) else: From e439c784fa318dbc23c04b0730bee0fccf46481d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eldar=20Kurti=C4=87?= <8884008+eldarkurtic@users.noreply.github.com> Date: Sat, 15 Nov 2025 15:12:02 +0100 Subject: [PATCH 102/578] Add support for Eagle with separate lm-head and embed_tokens layers (#28549) Signed-off-by: Eldar Kurtic <8884008+eldarkurtic@users.noreply.github.com> --- tests/v1/spec_decode/test_eagle.py | 33 +++--- tests/v1/spec_decode/test_mtp.py | 4 + vllm/model_executor/models/deepseek_eagle.py | 3 +- vllm/model_executor/models/deepseek_v2.py | 4 +- vllm/model_executor/models/interfaces.py | 70 ++++++++++++- vllm/model_executor/models/llama.py | 6 +- vllm/model_executor/models/llama4_eagle.py | 3 +- vllm/model_executor/models/llama_eagle.py | 3 +- vllm/model_executor/models/llama_eagle3.py | 3 +- vllm/model_executor/models/minicpm_eagle.py | 12 ++- vllm/model_executor/models/utils.py | 23 +++++ vllm/v1/spec_decode/eagle.py | 103 +++++++++++++------ 12 files changed, 204 insertions(+), 63 deletions(-) diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index 421da5241555..805b8c86b080 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -324,6 +324,7 @@ def test_prepare_inputs_padded(): @pytest.mark.parametrize("attn_backend", get_attn_backend_list_based_on_platform()) @pytest.mark.parametrize("pp_size", [1, 2]) @pytest.mark.parametrize("use_distinct_embed_tokens", [True, False]) +@pytest.mark.parametrize("use_distinct_lm_head", [True, False]) @mock.patch("vllm.v1.spec_decode.eagle.get_pp_group") @mock.patch("vllm.v1.spec_decode.eagle.get_layers_from_vllm_config") @mock.patch("vllm.v1.spec_decode.eagle.get_model") @@ -335,6 +336,7 @@ def test_load_model( attn_backend, pp_size, use_distinct_embed_tokens, + use_distinct_lm_head, monkeypatch, ): monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend) @@ -350,12 +352,13 @@ def test_load_model( # Setup draft model mock mock_model = mock.MagicMock() + mock_model.model = mock.MagicMock() + mock_model.has_own_embed_tokens = use_distinct_embed_tokens if use_distinct_embed_tokens: - # Some models can have a different hidden size than the target model, - # so we test that their embed_tokens doesn't get overwritten - mock_model.model.embed_tokens.weight.shape = (131072, 2048) - else: - mock_model.model.embed_tokens.weight.shape = (131072, 4096) + mock_model.model.embed_tokens = mock.MagicMock() + mock_model.has_own_lm_head = use_distinct_lm_head + if use_distinct_lm_head: + mock_model.lm_head = mock.MagicMock() mock_get_model.return_value = mock_model @@ -391,15 +394,13 @@ class 
_TargetModelStub(LlamaForCausalLM): target_model = mock.create_autospec(_TargetModelStub, instance=True) target_model.model = mock.MagicMock() - target_model.model.embed_tokens.weight.shape = (131072, 4096) + target_model.lm_head = mock.MagicMock() + target_model.model.embed_tokens = mock.MagicMock() from vllm.model_executor.models import SupportsMultiModal assert not isinstance(target_model, SupportsMultiModal) - if method == "eagle": - target_model.lm_head = mock.MagicMock() - # Create proposer using the helper function proposer = _create_proposer(method, num_speculative_tokens=8) @@ -409,18 +410,18 @@ class _TargetModelStub(LlamaForCausalLM): # Verify common interactions mock_get_model.assert_called_once() - # Verify that EAGLE models gain the lm head from the target model - if method == "eagle": - assert proposer.model.lm_head == target_model.lm_head + # Verify that the lm head is set correctly + if use_distinct_lm_head: + assert proposer.model.lm_head is not target_model.lm_head + else: + assert proposer.model.lm_head is target_model.lm_head # Verify that the embed tokens are set correctly # If pp_size is > 1, the embed tokens should be distinct if pp_size > 1 or use_distinct_embed_tokens: - assert proposer.model.model.embed_tokens != target_model.model.embed_tokens + assert proposer.model.model.embed_tokens is not target_model.model.embed_tokens else: - # When pp_size is 1 and the draft and target models have - # embed_tokens of the same shape, they should be shared. - assert proposer.model.model.embed_tokens == target_model.model.embed_tokens + assert proposer.model.model.embed_tokens is target_model.model.embed_tokens @pytest.mark.parametrize("method", ["eagle", "eagle3"]) diff --git a/tests/v1/spec_decode/test_mtp.py b/tests/v1/spec_decode/test_mtp.py index 6d59b58e739e..c5c0491abaf7 100644 --- a/tests/v1/spec_decode/test_mtp.py +++ b/tests/v1/spec_decode/test_mtp.py @@ -67,6 +67,10 @@ def test_mtp_load_model_unified(mock_get_model, mock_get_layers, mock_get_pp_gro mock_model = mock.MagicMock() mock_model.model.embed_tokens.weight.shape = (131072, 4096) mock_get_model.return_value = mock_model + # MTP does not have its own embed_tokens or lm_head + # so it should share them with the target model + mock_model.has_own_embed_tokens = False + mock_model.has_own_lm_head = False target_attn_layers = {"target_attn_1": mock.MagicMock()} all_attn_layers = {**target_attn_layers, "draft_attn_1": mock.MagicMock()} diff --git a/vllm/model_executor/models/deepseek_eagle.py b/vllm/model_executor/models/deepseek_eagle.py index 9e834a73f8e5..3fb04c3b70dd 100644 --- a/vllm/model_executor/models/deepseek_eagle.py +++ b/vllm/model_executor/models/deepseek_eagle.py @@ -26,7 +26,7 @@ ) from vllm.utils import init_logger -from .utils import AutoWeightsLoader, maybe_prefix +from .utils import AutoWeightsLoader, maybe_prefix, process_eagle_weight logger = init_logger(__name__) @@ -250,6 +250,7 @@ def transform(inputs): name, loaded_weight = inputs if "lm_head" not in name: name = "model." 
+ name + process_eagle_weight(self, name) return name, loaded_weight loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 115818d903a6..e8ee9951d611 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -85,7 +85,7 @@ ) from vllm.v1.kv_cache_interface import KVCacheSpec, MLAAttentionSpec -from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP +from .interfaces import MixtureOfExperts, SupportsEagle, SupportsLoRA, SupportsPP from .utils import ( PPMissingLayer, is_pp_missing_parameter, @@ -1311,7 +1311,7 @@ def update_physical_experts_metadata( class DeepseekV2ForCausalLM( - nn.Module, SupportsPP, DeepseekV2MixtureOfExperts, SupportsLoRA + nn.Module, SupportsPP, DeepseekV2MixtureOfExperts, SupportsLoRA, SupportsEagle ): packed_modules_mapping = { "gate_up_proj": ["gate_proj", "up_proj"], diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 929bfaaee5cb..dc4caf2f02f9 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -932,13 +932,73 @@ def supports_transcription( @runtime_checkable -class SupportsEagle3(Protocol): +class SupportsEagleBase(Protocol): + """Base interface for models that support EAGLE-based speculative decoding.""" + + has_own_lm_head: bool = False + """ + A flag that indicates this model has trained its own lm_head. + """ + + has_own_embed_tokens: bool = False + """ + A flag that indicates this model has trained its own input embeddings. + """ + + +@overload +def supports_any_eagle(model: type[object]) -> TypeIs[type[SupportsEagleBase]]: ... + + +@overload +def supports_any_eagle(model: object) -> TypeIs[SupportsEagleBase]: ... + + +def supports_any_eagle( + model: type[object] | object, +) -> TypeIs[type[SupportsEagleBase]] | TypeIs[SupportsEagleBase]: + """Check if model supports any EAGLE variant (1, 2, or 3).""" + return supports_eagle(model) or supports_eagle3(model) + + +@runtime_checkable +class SupportsEagle(SupportsEagleBase, Protocol): + """The interface required for models that support + EAGLE-1 and EAGLE-2 speculative decoding.""" + + supports_eagle: ClassVar[Literal[True]] = True + """ + A flag that indicates this model supports EAGLE-1 and EAGLE-2 + speculative decoding. + + Note: + There is no need to redefine this flag if this class is in the + MRO of your model class. + """ + + +@overload +def supports_eagle(model: type[object]) -> TypeIs[type[SupportsEagle]]: ... + + +@overload +def supports_eagle(model: object) -> TypeIs[SupportsEagle]: ... + + +def supports_eagle( + model: type[object] | object, +) -> TypeIs[type[SupportsEagle]] | TypeIs[SupportsEagle]: + return isinstance(model, SupportsEagle) + + +@runtime_checkable +class SupportsEagle3(SupportsEagleBase, Protocol): """The interface required for models that support - EAGLE3 speculative decoding.""" + EAGLE-3 speculative decoding.""" supports_eagle3: ClassVar[Literal[True]] = True """ - A flag that indicates this model supports EAGLE3 + A flag that indicates this model supports EAGLE-3 speculative decoding. Note: @@ -949,7 +1009,7 @@ class SupportsEagle3(Protocol): def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None: """ Set which layers should output auxiliary - hidden states for EAGLE3. + hidden states for EAGLE-3. 
Args: layers: Tuple of layer indices that should output auxiliary @@ -960,7 +1020,7 @@ def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None: def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]: """ Get the layer indices that should output auxiliary hidden states - for EAGLE3. + for EAGLE-3. Returns: Tuple of layer indices for auxiliary hidden state outputs. diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index c49a1ea817f9..0a3f37c30ab5 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -58,7 +58,7 @@ ) from vllm.sequence import IntermediateTensors -from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP +from .interfaces import SupportsEagle, SupportsEagle3, SupportsLoRA, SupportsPP from .utils import ( AutoWeightsLoader, PPMissingLayer, @@ -529,7 +529,9 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: return loaded_params -class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3): +class LlamaForCausalLM( + nn.Module, SupportsLoRA, SupportsPP, SupportsEagle, SupportsEagle3 +): packed_modules_mapping = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], "gate_up_proj": ["gate_proj", "up_proj"], diff --git a/vllm/model_executor/models/llama4_eagle.py b/vllm/model_executor/models/llama4_eagle.py index e8716d652415..660c8f1bb522 100644 --- a/vllm/model_executor/models/llama4_eagle.py +++ b/vllm/model_executor/models/llama4_eagle.py @@ -35,7 +35,7 @@ from vllm.model_executor.models.utils import extract_layer_index from .interfaces import SupportsMultiModal -from .utils import AutoWeightsLoader, maybe_prefix +from .utils import AutoWeightsLoader, maybe_prefix, process_eagle_weight logger = init_logger(__name__) @@ -212,6 +212,7 @@ def transform(inputs): name, weight = self.permute_qk_weight_for_rotary(name, loaded_weight) if "lm_head" not in name: name = "model." + name + process_eagle_weight(self, name) return name, weight loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/llama_eagle.py b/vllm/model_executor/models/llama_eagle.py index ab2a9f6f06db..0287132c5637 100644 --- a/vllm/model_executor/models/llama_eagle.py +++ b/vllm/model_executor/models/llama_eagle.py @@ -17,7 +17,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.llama import LlamaDecoderLayer, LlamaForCausalLM -from .utils import AutoWeightsLoader, maybe_prefix +from .utils import AutoWeightsLoader, maybe_prefix, process_eagle_weight logger = init_logger(__name__) @@ -179,6 +179,7 @@ def transform(inputs): name, loaded_weight = inputs if "lm_head" not in name: name = "model." 
+ name + process_eagle_weight(self, name) return name, loaded_weight loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py index 6edc9519dfbb..a3bcc5eeb32b 100644 --- a/vllm/model_executor/models/llama_eagle3.py +++ b/vllm/model_executor/models/llama_eagle3.py @@ -23,7 +23,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import NestedTensors -from .utils import AutoWeightsLoader, maybe_prefix +from .utils import AutoWeightsLoader, maybe_prefix, process_eagle_weight logger = init_logger(__name__) @@ -324,6 +324,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): if "embed_tokens" in name: includes_embed_tokens = True model_weights[name] = loaded_weight + process_eagle_weight(self, name) skip_substrs = [] if not includes_draft_id_mapping: diff --git a/vllm/model_executor/models/minicpm_eagle.py b/vllm/model_executor/models/minicpm_eagle.py index 0ca31913485d..d0cdb70aa857 100644 --- a/vllm/model_executor/models/minicpm_eagle.py +++ b/vllm/model_executor/models/minicpm_eagle.py @@ -43,7 +43,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.sequence import IntermediateTensors -from .interfaces import SupportsLoRA, SupportsPP +from .interfaces import SupportsEagle, SupportsLoRA, SupportsPP from .minicpm import MiniCPMAttention as EagleMiniCPMAttention from .minicpm import MiniCPMMLP as EagleMiniCPMMLP from .minicpm import MiniCPMMoE as EagleMiniCPMMoE @@ -52,6 +52,7 @@ is_pp_missing_parameter, make_empty_intermediate_tensors_factory, maybe_prefix, + process_eagle_weight, ) @@ -289,7 +290,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: return loaded_params -class EagleMiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP): +class EagleMiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -376,8 +377,13 @@ def compute_logits( return logits def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + def transform(inputs): + name, loaded_weight = inputs + process_eagle_weight(self, name) + return name, loaded_weight + loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), ) - return loader.load_weights(weights) + return loader.load_weights(map(transform, weights)) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index e5663c8a057a..0d811fbc7585 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -19,6 +19,7 @@ ) from vllm.logger import init_logger from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.interfaces import supports_any_eagle from vllm.multimodal import NestedTensors from vllm.sequence import IntermediateTensors from vllm.utils.math_utils import cdiv @@ -825,3 +826,25 @@ def sequence_parallel_chunk_impl_fake(x: torch.Tensor) -> torch.Tensor: fake_impl=sequence_parallel_chunk_impl_fake, tags=(torch.Tag.needs_fixed_stride_order,), ) + + +def process_eagle_weight( + model: nn.Module, + name: str, +) -> None: + """ + Update EAGLE model flags based on loaded weight name. + This should be called during weight loading to detect if a model + has its own lm_head or embed_tokens weight. 
+ Args: + model: The model instance (must support EAGLE) + name: The name of the weight to process + """ + if not supports_any_eagle(model): + return + + # To prevent overriding with target model's layers + if "lm_head" in name: + model.has_own_lm_head = True + if "embed_tokens" in name: + model.has_own_embed_tokens = True diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index f3b34544f8d9..ed602f39d0f9 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -991,6 +991,7 @@ def load_model(self, target_model: nn.Module) -> None: target_language_model = target_model.get_language_model() else: target_language_model = target_model + # share embed_tokens with the target model if needed if get_pp_group().world_size == 1: if hasattr(target_language_model.model, "embed_tokens"): @@ -1002,52 +1003,92 @@ def load_model(self, target_model: nn.Module) -> None: "Target model does not have 'embed_tokens' or 'embedding' attribute" ) - # Check if shapes match and we found the embedding - eagle_shape = self.model.model.embed_tokens.weight.shape - target_shape = target_embed_tokens.weight.shape - if eagle_shape == target_shape: - logger.info( - "Assuming the EAGLE head shares the same vocab embedding" - " with the target model." - ) - del self.model.model.embed_tokens - self.model.model.embed_tokens = target_embed_tokens + share_embeddings = False + if hasattr(self.model, "has_own_embed_tokens"): + # EAGLE model + if not self.model.has_own_embed_tokens: + share_embeddings = True + logger.info( + "Detected EAGLE model without its own embed_tokens in the" + " checkpoint. Sharing target model embedding weights with the" + " draft model." + ) + elif ( + isinstance(target_embed_tokens.weight, torch.Tensor) + and isinstance(self.model.model.embed_tokens.weight, torch.Tensor) + and torch.equal( + target_embed_tokens.weight, self.model.model.embed_tokens.weight + ) + ): + share_embeddings = True + logger.info( + "Detected EAGLE model with embed_tokens identical to the target" + " model. Sharing target model embedding weights with the draft" + " model." + ) + else: + logger.info( + "Detected EAGLE model with distinct embed_tokens weights. " + "Keeping separate embedding weights from the target model." + ) else: + # MTP model + share_embeddings = True logger.info( - "The EAGLE head's vocab embedding will be loaded separately" - " from the target model." + "Detected MTP model. " + "Sharing target model embedding weights with the draft model." ) + + if share_embeddings: + if hasattr(self.model.model, "embed_tokens"): + del self.model.model.embed_tokens + self.model.model.embed_tokens = target_embed_tokens else: logger.info( - "The EAGLE head's vocab embedding will be loaded separately" + "The draft model's vocab embedding will be loaded separately" " from the target model." 
) # share lm_head with the target model if needed - # some model definition do not define lm_head explicitly - # and reuse embed_tokens for lm_head, e.g., CohereForCausalLM - if self.vllm_config.speculative_config.method != "eagle3": - if hasattr(target_language_model, "lm_head"): - logger.info("Loading EAGLE LM head weights from the target model.") - self.model.lm_head = target_language_model.lm_head - else: - if ( - hasattr(self.model, "lm_head") - and hasattr(target_language_model, "lm_head") - and self.model.lm_head.weight.shape - == target_language_model.lm_head.weight.shape + share_lm_head = False + if hasattr(self.model, "has_own_lm_head"): + # EAGLE model + if not self.model.has_own_lm_head: + share_lm_head = True + logger.info( + "Detected EAGLE model without its own lm_head in the checkpoint. " + "Sharing target model lm_head weights with the draft model." + ) + elif ( + hasattr(target_language_model, "lm_head") + and isinstance(target_language_model.lm_head.weight, torch.Tensor) + and isinstance(self.model.lm_head.weight, torch.Tensor) + and torch.equal( + target_language_model.lm_head.weight, self.model.lm_head.weight + ) ): + share_lm_head = True logger.info( - "Assuming the EAGLE head shares the same lm_head" - " with the target model." + "Detected EAGLE model with lm_head identical to the target model. " + "Sharing target model lm_head weights with the draft model." ) - del self.model.lm_head - self.model.lm_head = target_language_model.lm_head else: logger.info( - "The EAGLE head's lm_head will be loaded separately" - " from the target model." + "Detected EAGLE model with distinct lm_head weights. " + "Keeping separate lm_head weights from the target model." ) + else: + # MTP model + share_lm_head = True + logger.info( + "Detected MTP model. " + "Sharing target model lm_head weights with the draft model." 
+ ) + + if share_lm_head and hasattr(target_language_model, "lm_head"): + if hasattr(self.model, "lm_head"): + del self.model.lm_head + self.model.lm_head = target_language_model.lm_head @torch.inference_mode() def dummy_run( From 637f292196237982558936166540ed8d153a75eb Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Sat, 15 Nov 2025 08:44:14 -0800 Subject: [PATCH 103/578] [CI] Fix broken pipeline (#28781) Signed-off-by: Nick Hill --- .buildkite/test-pipeline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 723f311a2646..4ac76aba67b9 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -926,7 +926,7 @@ steps: - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - pytest -v -s tests/kernels/moe/test_flashinfer.py -- label: Blackwell Fusion & Compile Tests # 30 min +- label: Blackwell Fusion and Compile Tests # 30 min timeout_in_minutes: 40 working_dir: "/vllm-workspace/" gpu: b200 From 07cadab27a23bf1fbc1090f77fcc650eeb1612e8 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Sat, 15 Nov 2025 19:03:09 +0000 Subject: [PATCH 104/578] [Model][Qwen3VL] Cache positional embedding indices (#28475) Signed-off-by: Lukas Geiger Co-authored-by: Roger Wang --- vllm/model_executor/models/qwen3_vl.py | 57 +++++++++++++++----------- 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index fa6b71bf9268..7f0c9372991d 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -25,7 +25,7 @@ """Inference-only Qwen3VL model compatible with HuggingFace weights.""" from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence -from functools import partial +from functools import lru_cache, partial from itertools import islice from typing import Any @@ -416,30 +416,41 @@ def dtype(self) -> torch.dtype: def device(self) -> torch.device: return self.patch_embed.proj.weight.device + @staticmethod + @lru_cache(maxsize=1024) + def rot_pos_ids(h: int, w: int, spatial_merge_size: int) -> torch.Tensor: + hpos_ids = np.broadcast_to(np.arange(h).reshape(h, 1), (h, w)) + h_div = h // spatial_merge_size + w_div = w // spatial_merge_size + hpos_ids = hpos_ids.reshape( + h_div, + spatial_merge_size, + w_div, + spatial_merge_size, + ) + hpos_ids = hpos_ids.transpose(0, 2, 1, 3) + hpos_ids = hpos_ids.flatten() + + wpos_ids = np.broadcast_to(np.arange(w).reshape(1, w), (h, w)) + wpos_ids = wpos_ids.reshape( + h_div, + spatial_merge_size, + w_div, + spatial_merge_size, + ) + wpos_ids = wpos_ids.transpose(0, 2, 1, 3) + wpos_ids = wpos_ids.flatten() + + return torch.from_numpy(np.stack([hpos_ids, wpos_ids], axis=-1)) + def rot_pos_emb(self, grid_thw: list[list[int]]): - pos_ids = [] max_grid_size = max(max(h, w) for _, h, w in grid_thw) - for t, h, w in grid_thw: - hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) - hpos_ids = hpos_ids.reshape( - h // self.spatial_merge_size, - self.spatial_merge_size, - w // self.spatial_merge_size, - self.spatial_merge_size, - ) - hpos_ids = hpos_ids.permute(0, 2, 1, 3) - hpos_ids = hpos_ids.flatten() - - wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) - wpos_ids = wpos_ids.reshape( - h // self.spatial_merge_size, - self.spatial_merge_size, - w // self.spatial_merge_size, - self.spatial_merge_size, - ) - wpos_ids = wpos_ids.permute(0, 2, 1, 3) - wpos_ids = wpos_ids.flatten() - pos_ids.append(torch.stack([hpos_ids, wpos_ids], 
dim=-1).repeat(t, 1)) + pos_ids = [ + self.rot_pos_ids(h, w, self.spatial_merge_size) + if t == 1 + else self.rot_pos_ids(h, w, self.spatial_merge_size).repeat(t, 1) + for t, h, w in grid_thw + ] pos_ids = torch.cat(pos_ids, dim=0) rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) From 2bb4435cb7e2e2317b0f20803347690fb38fe6b4 Mon Sep 17 00:00:00 2001 From: Didier Durand <2927957+didier-durand@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:27:50 +0100 Subject: [PATCH 105/578] [Doc]: fix typos in various files (#28567) Signed-off-by: Didier Durand --- docs/design/moe_kernel_features.md | 2 +- docs/features/quantization/quark.md | 2 +- vllm/compilation/compiler_interface.py | 2 +- vllm/compilation/decorators.py | 4 ++-- vllm/v1/worker/gpu_model_runner.py | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/design/moe_kernel_features.md b/docs/design/moe_kernel_features.md index ee224e6922fb..7663b82266f0 100644 --- a/docs/design/moe_kernel_features.md +++ b/docs/design/moe_kernel_features.md @@ -68,7 +68,7 @@ Modular kernels are supported by the following `FusedMoEMethodBase` classes. ## Fused MoE Experts Kernels -The are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. Many have modular kernel adatpers so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties. +The are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. Many have modular kernel adapters so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties. Each kernel must be provided with one of the supported input activation formats. Some flavors of kernels support both standard and batched formats through different entry points, e.g. `TritonExperts` and `BatchedTritonExperts`. Batched format kernels are currently only needed for matching with certain all2all backends, e.g. `pplx`, `DeepEPLLPrepareAndFinalize`. diff --git a/docs/features/quantization/quark.md b/docs/features/quantization/quark.md index be0702f4c9e1..bd7bc186e13a 100644 --- a/docs/features/quantization/quark.md +++ b/docs/features/quantization/quark.md @@ -298,7 +298,7 @@ There are two steps to generate and deploy a mixed precision model quantized wit Firstly, the layerwise mixed-precision configuration for a given LLM model is searched and then quantized using AMD Quark. We will provide a detailed tutorial with Quark APIs later. -As examples, we provide some ready-to-use quantized mixed precision model to show the usage in vLLM and the accuracy benifits. They are: +As examples, we provide some ready-to-use quantized mixed precision model to show the usage in vLLM and the accuracy benefits. 
They are: - amd/Llama-2-70b-chat-hf-WMXFP4FP8-AMXFP4FP8-AMP-KVFP8 - amd/Mixtral-8x7B-Instruct-v0.1-WMXFP4FP8-AMXFP4FP8-AMP-KVFP8 diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py index b0cdb08884a3..11cf0f85c178 100644 --- a/vllm/compilation/compiler_interface.py +++ b/vllm/compilation/compiler_interface.py @@ -299,7 +299,7 @@ def initialize_cache( self.base_cache_dir = cache_dir[: -len(prefix)] if prefix else cache_dir if disable_cache: return - # redirect the cache directory to a sub-directory + # redirect the cache directory to a subdirectory # set flags so that Inductor and Triton store their cache # in the cache_dir, then users only need to copy the cache_dir # to another machine to reuse the cache. diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index e325bca73abb..11a18c0e6bb7 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -159,7 +159,7 @@ def forward(self, x: torch.Tensor, y: Optional[torch.Tensor]): ... `mark_unbacked_dims` is a dictionary that maps argument names with a dynamic dim to be decorated with `mark_unbacked`. This is useful if we would like to - enforce that dynamo do not specialize on 0/1 values in the case of dummy input + enforce that dynamo does not specialize on 0/1 values in the case of dummy input such as for vision model compilation """ @@ -483,7 +483,7 @@ def maybe_use_cudagraph_partition_wrapper(vllm_config: VllmConfig): Context manager to set/unset customized cudagraph partition wrappers. If we're using Inductor-based graph partitioning, we currently have the - whole `fx.Graph` before Inductor lowering and and the piecewise + whole `fx.Graph` before Inductor lowering and the piecewise splitting happens after all graph passes and fusions. Here, we add a custom hook for Inductor to wrap each partition with our static graph wrapper class to maintain more control over static graph diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d0d6164180e6..6590ca54af68 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2871,7 +2871,7 @@ def propose_draft_token_ids( "gpu_model_runner: set_async_sampled_token_ids" ): # Save ref of sampled_token_ids CPU tensor if the batch contains - # any requests with sampling params that that require output ids. + # any requests with sampling params that require output ids. self.input_batch.set_async_sampled_token_ids( async_output.sampled_token_ids_cpu, async_output.async_copy_ready_event, From be263f76451ad8a32baf0b935d3f0432d05300e6 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Sat, 15 Nov 2025 17:35:06 -0500 Subject: [PATCH 106/578] [BugFix] Fix `AssertionError: DCP not support reorder_batch_threshold > 1 now.` (#28751) Signed-off-by: Lucas Wilkinson --- vllm/v1/worker/gpu_model_runner.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 6590ca54af68..ffbac5fe12f7 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -630,16 +630,6 @@ def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> None: return if self.reorder_batch_threshold is not None: - # NOTE(lucas): currently no backend supports the custom masking - # required for DCP with q_len > 1, so we assert here. Remove this - # assert once the custom mask is support is added to FA3. 
- if ( - self.dcp_world_size > 1 - and envs.VLLM_ATTENTION_BACKEND != "FLASH_ATTN_MLA" - ): - assert self.reorder_batch_threshold == 1, ( - "DCP not support reorder_batch_threshold > 1 now." - ) reorder_batch_to_split_decodes_and_prefills( self.input_batch, scheduler_output, From f849ee739cdb3d82fce1660a6fd91806e8ae9bff Mon Sep 17 00:00:00 2001 From: Bram Wasti Date: Sun, 16 Nov 2025 00:22:17 -0500 Subject: [PATCH 107/578] Adding a benchmark for batch invariance (#28161) Signed-off-by: Bram Wasti Signed-off-by: Bram Wasti Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- benchmarks/benchmark_batch_invariance.py | 380 +++++++++++++++++++++++ 1 file changed, 380 insertions(+) create mode 100755 benchmarks/benchmark_batch_invariance.py diff --git a/benchmarks/benchmark_batch_invariance.py b/benchmarks/benchmark_batch_invariance.py new file mode 100755 index 000000000000..b5c16c42de46 --- /dev/null +++ b/benchmarks/benchmark_batch_invariance.py @@ -0,0 +1,380 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Benchmark to measure the performance overhead of VLLM_BATCH_INVARIANT mode. + +This benchmark runs the same workload twice: +1. With VLLM_BATCH_INVARIANT=0 (baseline) +2. With VLLM_BATCH_INVARIANT=1 (batch invariant mode) + +And reports the timing and throughput metrics for comparison. + +Environment variables: + VLLM_BENCH_MODEL: Model to benchmark (default: "Qwen/Qwen3-1.7B") + VLLM_BENCH_TP_SIZE: Tensor parallel size (default: 1, use 8 for deepseek) + VLLM_BENCH_BATCH_SIZE: Max batch size (default: 128) + VLLM_BENCH_NUM_TRIALS: Number of trials to run (default: 5) + VLLM_BENCH_MIN_PROMPT: Min prompt length in words (default: 1024) + VLLM_BENCH_MAX_PROMPT: Max prompt length in words (default: 2048) + VLLM_BENCH_MAX_TOKENS: Max tokens to generate (default: 128) + VLLM_BENCH_TEMPERATURE: Temperature for sampling (default: 0.0) + VLLM_BENCH_GPU_MEMORY_UTILIZATION: GPU memory utilization (default: 0.4) + VLLM_BENCH_MAX_MODEL_LEN: Max model length (default: 5120) + VLLM_BENCH_BACKEND: Attention backend (default: FLASH_ATTN) + +Example usage: + # Benchmark qwen3 (default) + python benchmarks/benchmark_batch_invariance.py + + # Benchmark deepseek with 8 GPUs + VLLM_BENCH_MODEL="deepseek-ai/DeepSeek-V3" VLLM_BENCH_TP_SIZE=8 \\ + python benchmarks/benchmark_batch_invariance.py + + # Quick test with fewer trials + VLLM_BENCH_NUM_TRIALS=2 VLLM_BENCH_BATCH_SIZE=32 \\ + python benchmarks/benchmark_batch_invariance.py +""" + +import contextlib +import os +import random +import time + +from vllm import LLM, SamplingParams +from vllm.platforms import current_platform + + +def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str: + """Generate a random prompt for benchmarking.""" + prompt_templates = [ + "Question: What is the capital of France?\nAnswer: The capital of France is", + "Q: How does photosynthesis work?\nA: Photosynthesis is the process by which", + "User: Can you explain quantum mechanics?\nAssistant: Quantum mechanics is", + "Once upon a time in a distant galaxy, there lived", + "The old man walked slowly down the street, remembering", + "In the year 2157, humanity finally discovered", + "To implement a binary search tree in Python, first we need to", + "The algorithm works by iterating through the array and", + "Here's how to optimize database queries using indexing:", + "The Renaissance was a period in European history 
that", + "Climate change is caused by several factors including", + "The human brain contains approximately 86 billion neurons which", + "I've been thinking about getting a new laptop because", + "Yesterday I went to the store and bought", + "My favorite thing about summer is definitely", + ] + + base_prompt = random.choice(prompt_templates) + + if max_words < min_words: + max_words = min_words + target_words = random.randint(min_words, max_words) + + if target_words > 50: + padding_text = ( + " This is an interesting topic that deserves more explanation. " + * (target_words // 50) + ) + base_prompt = base_prompt + padding_text + + return base_prompt + + +def run_benchmark_with_batch_invariant( + model: str, + tp_size: int, + max_batch_size: int, + num_trials: int, + min_prompt: int, + max_prompt: int, + max_tokens: int, + temperature: float, + gpu_mem_util: float, + max_model_len: int, + backend: str, + batch_invariant: bool, + seed: int = 12345, +) -> dict: + """ + Run the benchmark with the specified configuration. + + Returns a dict with timing and throughput metrics. + """ + random.seed(seed) + + # Set environment variables + os.environ["VLLM_ATTENTION_BACKEND"] = backend + if batch_invariant: + os.environ["VLLM_BATCH_INVARIANT"] = "1" + else: + os.environ["VLLM_BATCH_INVARIANT"] = "0" + + print(f"\n{'=' * 80}") + print(f"BENCHMARK: VLLM_BATCH_INVARIANT={int(batch_invariant)}") + print(f" Model: {model}") + print(f" TP Size: {tp_size}") + print(f" Backend: {backend}") + print(f" Max Batch Size: {max_batch_size}") + print(f" Trials: {num_trials}") + print(f" Max Tokens: {max_tokens}") + print(f"{'=' * 80}\n") + + sampling = SamplingParams( + temperature=temperature, + top_p=0.95, + max_tokens=max_tokens, + seed=20240919, + ) + + needle_prompt = "There once was a " + + llm = None + try: + # Create LLM engine + start_init = time.perf_counter() + llm = LLM( + model=model, + max_num_seqs=max_batch_size, + gpu_memory_utilization=gpu_mem_util, + max_model_len=max_model_len, + dtype="bfloat16", + tensor_parallel_size=tp_size, + enable_prefix_caching=False, + ) + init_time = time.perf_counter() - start_init + print(f"Engine initialization time: {init_time:.2f}s\n") + + # Generate baseline + print("Generating baseline (warmup)...") + baseline_out = llm.generate([needle_prompt], sampling) + assert len(baseline_out) == 1 + baseline_text = baseline_out[0].outputs[0].text + print(f"Baseline output: '{baseline_text[:50]}...'\n") + + # Run trials and measure timing + trial_times: list[float] = [] + total_tokens = 0 + total_prompts = 0 + + for trial in range(num_trials): + # Create a batch + prompts: list[str] = [] + batch_size = random.randint(max_batch_size // 2, max_batch_size) + needle_pos = random.randint(0, batch_size - 1) + for i in range(batch_size): + if i == needle_pos: + prompts.append(needle_prompt) + else: + prompts.append(_random_prompt(min_prompt, max_prompt)) + + # Measure time for this trial + start_time = time.perf_counter() + outputs = llm.generate(prompts, sampling) + trial_time = time.perf_counter() - start_time + + trial_times.append(trial_time) + total_prompts += len(prompts) + + # Count tokens + for output in outputs: + if output.outputs: + total_tokens += len(output.outputs[0].token_ids) + + print( + f"Trial {trial + 1}/{num_trials}: " + f"batch_size={batch_size}, " + f"time={trial_time:.2f}s" + ) + + # Verify needle output still matches + needle_output = outputs[needle_pos] + assert needle_output.prompt == needle_prompt + + # Compute statistics + avg_time = sum(trial_times) / 
len(trial_times) + min_time = min(trial_times) + max_time = max(trial_times) + throughput = total_tokens / sum(trial_times) + prompts_per_sec = total_prompts / sum(trial_times) + + print(f"\n{'=' * 80}") + print("RESULTS:") + print(f" Average time per trial: {avg_time:.2f}s") + print(f" Min time: {min_time:.2f}s") + print(f" Max time: {max_time:.2f}s") + print(f" Total tokens generated: {total_tokens}") + print(f" Total prompts processed: {total_prompts}") + print(f" Throughput: {throughput:.2f} tokens/s") + print(f" Prompts/s: {prompts_per_sec:.2f}") + print(f"{'=' * 80}\n") + + return { + "init_time": init_time, + "avg_time": avg_time, + "min_time": min_time, + "max_time": max_time, + "total_tokens": total_tokens, + "total_prompts": total_prompts, + "throughput": throughput, + "prompts_per_sec": prompts_per_sec, + "trial_times": trial_times, + } + + finally: + # Cleanup + if llm is not None: + with contextlib.suppress(Exception): + llm.shutdown() + + +def main(): + # Check platform support + if not (current_platform.is_cuda() and current_platform.has_device_capability(90)): + print("ERROR: Requires CUDA and >= Hopper (SM90)") + print(f"Current platform: {current_platform.device_type}") + if current_platform.is_cuda(): + print(f"Device capability: {current_platform.get_device_capability()}") + return 1 + + # Read configuration from environment + model = os.getenv("VLLM_BENCH_MODEL", "Qwen/Qwen3-1.7B") + tp_size = int(os.getenv("VLLM_BENCH_TP_SIZE", "1")) + max_batch_size = int(os.getenv("VLLM_BENCH_BATCH_SIZE", "128")) + num_trials = int(os.getenv("VLLM_BENCH_NUM_TRIALS", "5")) + min_prompt = int(os.getenv("VLLM_BENCH_MIN_PROMPT", "1024")) + max_prompt = int(os.getenv("VLLM_BENCH_MAX_PROMPT", "2048")) + max_tokens = int(os.getenv("VLLM_BENCH_MAX_TOKENS", "128")) + temperature = float(os.getenv("VLLM_BENCH_TEMPERATURE", "0.0")) + gpu_mem_util = float(os.getenv("VLLM_BENCH_GPU_MEMORY_UTILIZATION", "0.4")) + max_model_len = int(os.getenv("VLLM_BENCH_MAX_MODEL_LEN", "5120")) + backend = os.getenv("VLLM_BENCH_BACKEND", "FLASH_ATTN") + + print("\n" + "=" * 80) + print("VLLM BATCH INVARIANCE BENCHMARK") + print("=" * 80) + print("\nConfiguration:") + print(f" Model: {model}") + print(f" Tensor Parallel Size: {tp_size}") + print(f" Attention Backend: {backend}") + print(f" Max Batch Size: {max_batch_size}") + print(f" Number of Trials: {num_trials}") + print(f" Prompt Length Range: {min_prompt}-{max_prompt} words") + print(f" Max Tokens to Generate: {max_tokens}") + print(f" Temperature: {temperature}") + print(f" GPU Memory Utilization: {gpu_mem_util}") + print(f" Max Model Length: {max_model_len}") + print("=" * 80) + + # Run benchmark WITHOUT batch invariance (baseline) + print("\n" + "=" * 80) + print("PHASE 1: Running WITHOUT batch invariance (baseline)") + print("=" * 80) + baseline_results = run_benchmark_with_batch_invariant( + model=model, + tp_size=tp_size, + max_batch_size=max_batch_size, + num_trials=num_trials, + min_prompt=min_prompt, + max_prompt=max_prompt, + max_tokens=max_tokens, + temperature=temperature, + gpu_mem_util=gpu_mem_util, + max_model_len=max_model_len, + backend=backend, + batch_invariant=False, + ) + + # Run benchmark WITH batch invariance + print("\n" + "=" * 80) + print("PHASE 2: Running WITH batch invariance") + print("=" * 80) + batch_inv_results = run_benchmark_with_batch_invariant( + model=model, + tp_size=tp_size, + max_batch_size=max_batch_size, + num_trials=num_trials, + min_prompt=min_prompt, + max_prompt=max_prompt, + max_tokens=max_tokens, + 
temperature=temperature, + gpu_mem_util=gpu_mem_util, + max_model_len=max_model_len, + backend=backend, + batch_invariant=True, + ) + + # Compare results + print("\n" + "=" * 80) + print("COMPARISON: Batch Invariance vs Baseline") + print("=" * 80) + + init_overhead_pct = ( + (batch_inv_results["init_time"] - baseline_results["init_time"]) + / baseline_results["init_time"] + * 100 + ) + time_overhead_pct = ( + (batch_inv_results["avg_time"] - baseline_results["avg_time"]) + / baseline_results["avg_time"] + * 100 + ) + throughput_change_pct = ( + (batch_inv_results["throughput"] - baseline_results["throughput"]) + / baseline_results["throughput"] + * 100 + ) + + print("\nInitialization Time:") + print(f" Baseline: {baseline_results['init_time']:.2f}s") + print(f" Batch Invariant: {batch_inv_results['init_time']:.2f}s") + print(f" Overhead: {init_overhead_pct:+.2f}%") + + print("\nAverage Trial Time:") + print(f" Baseline: {baseline_results['avg_time']:.2f}s") + print(f" Batch Invariant: {batch_inv_results['avg_time']:.2f}s") + print(f" Overhead: {time_overhead_pct:+.2f}%") + + print("\nThroughput (tokens/s):") + print(f" Baseline: {baseline_results['throughput']:.2f}") + print(f" Batch Invariant: {batch_inv_results['throughput']:.2f}") + print(f" Change: {throughput_change_pct:+.2f}%") + + print("\nPrompts/s:") + print(f" Baseline: {baseline_results['prompts_per_sec']:.2f}") + print(f" Batch Invariant: {batch_inv_results['prompts_per_sec']:.2f}") + + print("\n" + "=" * 80) + print("SUMMARY") + print("=" * 80) + if time_overhead_pct > 0: + print( + f"Batch invariance mode adds approximately {time_overhead_pct:.1f}% " + "overhead" + ) + else: + print( + f"Batch invariance mode is approximately {-time_overhead_pct:.1f}% " + "faster (unexpected!)" + ) + + if abs(throughput_change_pct) < 1.0: + print("Throughput difference is negligible (< 1%)") + elif throughput_change_pct < 0: + print( + f"Throughput decreased by {-throughput_change_pct:.1f}% " + "with batch invariance" + ) + else: + print( + f"Throughput increased by {throughput_change_pct:.1f}% " + "with batch invariance (unexpected!)" + ) + + print("=" * 80 + "\n") + + return 0 + + +if __name__ == "__main__": + exit(main()) From d231876ce31d8738a6e13a13591ae7d90d8b93f7 Mon Sep 17 00:00:00 2001 From: ai-jz <156989844+ai-jz@users.noreply.github.com> Date: Sat, 15 Nov 2025 23:04:32 -0800 Subject: [PATCH 108/578] [Benchmark] Fix client seed synchronization in multi-turn benchmark (#28512) Signed-off-by: ai-jz --- benchmarks/multi_turn/benchmark_serving_multi_turn.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py index ae9e9753441a..772d685ad90f 100644 --- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py +++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py @@ -561,8 +561,11 @@ async def client_main( f"{Color.CYAN}Started client {client_id}: max_num_requests={args.max_num_requests}, max_active_conversations={args.max_active_conversations}{Color.RESET}" # noqa: E501 ) - random.seed(args.seed) - np.random.seed(args.seed) + # Set unique seed per client (each client runs in its own process) + # Add 1 to ensure no client uses the same seed as the main process + client_seed = args.seed + client_id + 1 + random.seed(client_seed) + np.random.seed(client_seed) # Active conversations active_convs: ConversationsMap = {} @@ -1490,6 +1493,7 @@ async def main() -> None: f"Invalid 
--warmup-percentage={args.warmup_percentage}" ) from None + # Set global seeds for main process random.seed(args.seed) np.random.seed(args.seed) From a55b64635c272ff1f34d20593140faa1fcbe4580 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Sun, 16 Nov 2025 16:04:50 +0800 Subject: [PATCH 109/578] [Model] Allow users to control skip reading cache per request. (#28194) Signed-off-by: wang.yuqi Signed-off-by: wang.yuqi --- .../pooling/test_extract_hidden_states.py | 29 +++++++++++++++++-- vllm/pooling_params.py | 12 ++++++++ vllm/sampling_params.py | 8 +++++ vllm/v1/core/kv_cache_manager.py | 11 ++++--- vllm/v1/request.py | 15 ++++++++++ 5 files changed, 67 insertions(+), 8 deletions(-) diff --git a/tests/models/language/pooling/test_extract_hidden_states.py b/tests/models/language/pooling/test_extract_hidden_states.py index f8e3fa7d1560..0d41b93233d5 100644 --- a/tests/models/language/pooling/test_extract_hidden_states.py +++ b/tests/models/language/pooling/test_extract_hidden_states.py @@ -11,7 +11,7 @@ ["Qwen/Qwen3-0.6B"], ) @torch.inference_mode -def test_embed_models(hf_runner, vllm_runner, model: str): +def test_extract_hidden_states(hf_runner, vllm_runner, model: str): n_prompt_tokens = [55, 56, 57] token_prompts = [[1024 + i for i in range(n)] for n in n_prompt_tokens] @@ -21,7 +21,7 @@ def test_embed_models(hf_runner, vllm_runner, model: str): enforce_eager=True, runner="pooling", enable_chunked_prefill=False, - enable_prefix_caching=False, + enable_prefix_caching=True, ) as vllm_model: pooling_outputs = vllm_model.llm.encode( [TokensPrompt(prompt_token_ids=t) for t in token_prompts], @@ -30,4 +30,29 @@ def test_embed_models(hf_runner, vllm_runner, model: str): for n, output in zip(n_prompt_tokens, pooling_outputs): assert len(output.prompt_token_ids) == n + assert len(output.outputs.data) == n assert output.num_cached_tokens == 0 + + # test enable_prefix_caching plus all pooling + # we need to skip reading cache at this request by + # request.skip_reading_prefix_cache + pooling_outputs = vllm_model.llm.encode( + [TokensPrompt(prompt_token_ids=t) for t in token_prompts], + pooling_task="token_embed", + ) + + for n, output in zip(n_prompt_tokens, pooling_outputs): + assert len(output.prompt_token_ids) == n + assert len(output.outputs.data) == n + assert output.num_cached_tokens == 0 + + # skip_reading_prefix_cache can still write to cache + # to accelerate following requests + pooling_outputs = vllm_model.llm.encode( + [TokensPrompt(prompt_token_ids=t) for t in token_prompts], + pooling_task="embed", + ) + + for n, output in zip(n_prompt_tokens, pooling_outputs): + assert len(output.prompt_token_ids) == n + assert output.num_cached_tokens > 0 diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index 72a8320cc1bf..5c3dfa8ac9cb 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -57,6 +57,7 @@ class PoolingParams( ## Internal use only task: PoolingTask | None = None requires_token_ids: bool = False + skip_reading_prefix_cache: bool = None extra_kwargs: dict[str, Any] | None = None output_kind: RequestOutputKind = RequestOutputKind.FINAL_ONLY @@ -93,6 +94,8 @@ def verify( # plugin task uses io_processor.parse_request to verify inputs, # skipping PoolingParams verify if self.task == "plugin": + if self.skip_reading_prefix_cache is None: + self.skip_reading_prefix_cache = True return # NOTE: Task validation needs to done against the model instance, @@ -122,6 +125,15 @@ def _merge_default_parameters( if getattr(self, k, None) is None: setattr(self, k, 
getattr(pooler_config, k)) + if self.skip_reading_prefix_cache is None: + # If prefix caching is enabled, + # the output of all pooling may less than n_prompt_tokens, + # we need to skip reading cache at this request. + if self.task in ["token_embed", "token_classify"]: + self.skip_reading_prefix_cache = True + else: + self.skip_reading_prefix_cache = False + self._verify_step_pooling(pooler_config, valid_parameters) def _verify_step_pooling( diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index dd820840410e..901d66163452 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -254,6 +254,8 @@ class SamplingParams( generated token can complete the sequence.""" _bad_words_token_ids: list[list[int]] | None = None + skip_reading_prefix_cache: bool = None + @staticmethod def from_optional( n: int | None = 1, @@ -414,6 +416,12 @@ def __post_init__(self) -> None: self.structured_outputs = self.guided_decoding self.guided_decoding = None + if self.skip_reading_prefix_cache is None: + # If prefix caching is enabled, + # the output of prompt logprobs may less than n_prompt_tokens, + # we need to skip reading cache at this request. + self.skip_reading_prefix_cache = self.prompt_logprobs is not None + def _verify_args(self) -> None: if not isinstance(self.n, int): raise ValueError(f"n must be an int, but is of type {type(self.n)}") diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 63a1ff06e404..7f405fc248ac 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -185,12 +185,11 @@ def get_computed_blocks(self, request: Request) -> tuple[KVCacheBlocks, int]: - A list of blocks that are computed for the request. - The number of computed tokens. """ - # Prefix caching is disabled or - # When the request requires prompt logprobs, we skip prefix caching. - if not self.enable_caching or ( - request.sampling_params is not None - and request.sampling_params.prompt_logprobs is not None - ): + # We skip finding the prefix cache hit when prefix caching is + # disabled or the request is marked as skipping kv cache read + # (which happens when the request requires prompt logprobs + # or calls a pooling model with all pooling). 
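The rule this patch encodes can be read off from the comment above: a request bypasses prefix-cache reads when its sampling parameters ask for prompt logprobs, or when its pooling task produces per-token outputs (token_embed / token_classify), while cache writes still happen so later requests can reuse the blocks, as the test added earlier in this patch checks. A minimal, self-contained sketch of that predicate, using hypothetical stand-in classes rather than vLLM's real SamplingParams/PoolingParams:

```python
from dataclasses import dataclass


@dataclass
class SamplingLike:
    """Hypothetical stand-in; the real SamplingParams has many more fields."""

    prompt_logprobs: int | None = None


@dataclass
class PoolingLike:
    """Hypothetical stand-in for PoolingParams."""

    task: str | None = None  # e.g. "embed", "token_embed", "token_classify"


def skips_reading_prefix_cache(
    sampling: SamplingLike | None, pooling: PoolingLike | None
) -> bool:
    # Sketch of the decision wired through Request.get_skip_reading_prefix_cache.
    if sampling is not None:
        # Prompt logprobs need logits for every prompt position, so a cache
        # hit that skips prompt tokens would drop required outputs.
        return sampling.prompt_logprobs is not None
    if pooling is not None:
        # All-token pooling needs hidden states for every prompt token.
        return pooling.task in ("token_embed", "token_classify")
    return False


assert skips_reading_prefix_cache(SamplingLike(prompt_logprobs=5), None)
assert skips_reading_prefix_cache(None, PoolingLike(task="token_embed"))
assert not skips_reading_prefix_cache(None, PoolingLike(task="embed"))
```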
+ if not self.enable_caching or request.skip_reading_prefix_cache: return self.empty_kv_cache_blocks, 0 # NOTE: When all tokens hit the cache, we must recompute the last token diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 7a5f1183ed48..3d92906fbf4b 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -127,6 +127,8 @@ def __init__( self.get_hash_new_full_blocks = partial(block_hasher, self) self.block_hashes = self.get_hash_new_full_blocks() + self.skip_reading_prefix_cache = self.get_skip_reading_prefix_cache() + @classmethod def from_engine_core_request( cls, @@ -180,6 +182,19 @@ def num_tokens_with_spec(self) -> int: def num_output_tokens(self) -> int: return len(self._output_token_ids) + def get_skip_reading_prefix_cache(self) -> bool: + if ( + self.sampling_params is not None + and self.sampling_params.skip_reading_prefix_cache is not None + ): + return self.sampling_params.skip_reading_prefix_cache + elif ( + self.pooling_params is not None + and self.pooling_params.skip_reading_prefix_cache is not None + ): + return self.pooling_params.skip_reading_prefix_cache + return False + def is_finished(self) -> bool: return RequestStatus.is_finished(self.status) From b316ac658985f542618316b4285bd213dfdde046 Mon Sep 17 00:00:00 2001 From: Lucia Fang <116399278+luccafong@users.noreply.github.com> Date: Sun, 16 Nov 2025 01:01:21 -0800 Subject: [PATCH 110/578] [V1] Support MP Executor for multi node distributed inference (#23691) Signed-off-by: Lu Fang Signed-off-by: github-actions[bot] Signed-off-by: Lucia Fang Signed-off-by: Lucia Fang <116399278+luccafong@users.noreply.github.com> Signed-off-by: Nick Hill Co-authored-by: Nick Hill --- tests/distributed/test_multiproc_executor.py | 437 ++++++++++++++++++ vllm/config/parallel.py | 40 ++ .../device_communicators/shm_broadcast.py | 110 ++++- vllm/distributed/parallel_state.py | 77 ++- vllm/engine/arg_utils.py | 91 +++- vllm/entrypoints/cli/serve.py | 31 +- vllm/v1/engine/utils.py | 15 +- vllm/v1/executor/multiproc_executor.py | 197 ++++++-- vllm/v1/worker/gpu_worker.py | 10 +- vllm/v1/worker/worker_base.py | 4 +- 10 files changed, 930 insertions(+), 82 deletions(-) create mode 100644 tests/distributed/test_multiproc_executor.py diff --git a/tests/distributed/test_multiproc_executor.py b/tests/distributed/test_multiproc_executor.py new file mode 100644 index 000000000000..e741a79bc4ed --- /dev/null +++ b/tests/distributed/test_multiproc_executor.py @@ -0,0 +1,437 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +Integration tests for MultiprocExecutor at the executor level. +This test directly tests the executor without going through the LLM interface, +focusing on executor initialization, RPC calls, and distributed execution. 
+""" + +import multiprocessing +import os + +from tests.utils import multi_gpu_test +from vllm.config import VllmConfig +from vllm.engine.arg_utils import EngineArgs +from vllm.utils import get_open_port +from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.executor.multiproc_executor import MultiprocExecutor + +MODEL = "facebook/opt-125m" + + +def create_vllm_config( + tensor_parallel_size: int = 1, + pipeline_parallel_size: int = 1, + max_model_len: int = 256, + gpu_memory_utilization: float = 0.3, + distributed_executor_backend: str = "mp", + nnodes: int = 1, + node_rank: int = 0, + master_port: int = 0, +) -> VllmConfig: + """Create a VllmConfig for testing using EngineArgs.""" + engine_args = EngineArgs( + model=MODEL, + tensor_parallel_size=tensor_parallel_size, + pipeline_parallel_size=pipeline_parallel_size, + max_model_len=max_model_len, + gpu_memory_utilization=gpu_memory_utilization, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True, + ) + vllm_config = engine_args.create_engine_config() + + # Override distributed node settings if needed + if nnodes > 1 or node_rank > 0: + vllm_config.parallel_config.nnodes = nnodes + vllm_config.parallel_config.node_rank = node_rank + vllm_config.parallel_config.master_port = master_port + if nnodes > 1: + vllm_config.parallel_config.disable_custom_all_reduce = True + + return vllm_config + + +def create_test_scheduler_output(num_requests: int = 1) -> SchedulerOutput: + """Create a minimal SchedulerOutput for testing.""" + # This is a simplified version - in practice you'd need proper + # SchedulerOutput construction based on the actual vLLM v1 API + return SchedulerOutput( + scheduled_new_reqs=[], + scheduled_resumed_reqs=[], + scheduled_running_reqs=[], + num_scheduled_tokens={}, + total_num_scheduled_tokens=0, + ) + + +def test_multiproc_executor_initialization(): + """Test that MultiprocExecutor can be initialized with proper config.""" + vllm_config = create_vllm_config( + tensor_parallel_size=1, + pipeline_parallel_size=1, + ) + + # Create executor - this should initialize workers + executor = MultiprocExecutor(vllm_config=vllm_config) + + # Verify executor properties + assert executor.world_size == 1, "World size should be 1 for single GPU" + assert executor.local_world_size == 1, "Local world size should be 1" + assert hasattr(executor, "workers"), "Executor should have workers" + assert len(executor.workers) == 1, "Should have 1 worker for single GPU" + + # Clean up + executor.shutdown() + + +@multi_gpu_test(num_gpus=2) +def test_multiproc_executor_initialization_tensor_parallel(): + """Test MultiprocExecutor initialization with tensor parallelism.""" + vllm_config = create_vllm_config( + tensor_parallel_size=2, + pipeline_parallel_size=1, + ) + + # Create executor + executor = MultiprocExecutor(vllm_config=vllm_config) + + # Verify executor properties + assert executor.world_size == 2, "World size should be 2 for TP=2" + assert executor.local_world_size == 2, "Local world size should be 2" + assert len(executor.workers) == 2, "Should have 2 workers for TP=2" + + # Verify output rank calculation + output_rank = executor._get_output_rank() + assert output_rank == 0, "Output rank should be 0 for TP=2, PP=1" + + # Clean up + executor.shutdown() + + +@multi_gpu_test(num_gpus=2) +def test_multiproc_executor_collective_rpc(): + """Test collective RPC calls to all workers.""" + vllm_config = create_vllm_config( + tensor_parallel_size=2, + pipeline_parallel_size=1, + ) + + # Create executor + 
executor = MultiprocExecutor(vllm_config=vllm_config) + + try: + # Test check_health RPC - should work without errors + executor.check_health() + + # Test that RPC works correctly + # Note: We're just testing that the RPC mechanism works, + # not testing actual model execution here + assert not executor.is_failed, "Executor should not be in failed state" + + finally: + # Clean up + executor.shutdown() + + +def test_multiproc_executor_failure_callback(): + """Test failure callback registration and invocation.""" + vllm_config = create_vllm_config( + tensor_parallel_size=1, + pipeline_parallel_size=1, + ) + + executor = MultiprocExecutor(vllm_config=vllm_config) + + try: + # Test callback registration + callback_invoked = [] + + def test_callback(): + callback_invoked.append(True) + + # Register callback + executor.register_failure_callback(test_callback) + + # Callback should not be invoked yet + assert len(callback_invoked) == 0, "Callback should not be invoked immediately" + + # Simulate failure + executor.is_failed = True + + # Register another callback - should be invoked immediately + executor.register_failure_callback(test_callback) + assert len(callback_invoked) == 1, ( + "Callback should be invoked when executor is failed" + ) + + finally: + # Clean up + executor.shutdown() + + +@multi_gpu_test(num_gpus=2) +def test_multiproc_executor_worker_monitor(): + """Test that worker monitor is set up correctly.""" + vllm_config = create_vllm_config( + tensor_parallel_size=2, + pipeline_parallel_size=1, + ) + + executor = MultiprocExecutor(vllm_config=vllm_config) + + try: + # Verify all worker processes are alive + for worker in executor.workers: + assert worker.proc.is_alive(), f"Worker rank {worker.rank} should be alive" + + # Verify executor is not in failed state + assert not executor.is_failed, "Executor should not be in failed state" + + finally: + # Clean up + executor.shutdown() + + # After shutdown, workers should be terminated + import time + + time.sleep(0.5) # Give processes time to terminate + for worker in executor.workers: + assert not worker.proc.is_alive(), ( + f"Worker rank {worker.rank} should terminate after shutdown" + ) + + +@multi_gpu_test(num_gpus=2) +def test_multiproc_executor_get_response_message_queues(): + """Test message queue retrieval for different ranks.""" + vllm_config = create_vllm_config( + tensor_parallel_size=2, + pipeline_parallel_size=1, + ) + + executor = MultiprocExecutor(vllm_config=vllm_config) + + try: + # Get all message queues + all_queues = executor.get_response_mqs() + assert len(all_queues) == 2, "Should have 2 message queues for 2 workers" + + # Get message queue for specific rank + rank0_queue = executor.get_response_mqs(unique_reply_rank=0) + assert len(rank0_queue) == 1, "Should have 1 message queue for rank 0" + + rank1_queue = executor.get_response_mqs(unique_reply_rank=1) + assert len(rank1_queue) == 1, "Should have 1 message queue for rank 1" + + finally: + # Clean up + executor.shutdown() + + +def test_multiproc_executor_shutdown_cleanup(): + """Test that shutdown properly cleans up resources.""" + vllm_config = create_vllm_config( + tensor_parallel_size=1, + pipeline_parallel_size=1, + ) + + executor = MultiprocExecutor(vllm_config=vllm_config) + + # Verify executor is set up + assert hasattr(executor, "workers"), "Executor should have workers" + assert len(executor.workers) > 0, "Should have at least one worker" + + # Shutdown + executor.shutdown() + + # Verify cleanup + import time + + time.sleep(0.5) # Give processes time to 
terminate + + for worker in executor.workers: + assert not worker.proc.is_alive(), "Worker processes should be terminated" + + # Verify shutdown event is set + assert executor.shutdown_event.is_set(), "Shutdown event should be set" + + # Multiple shutdowns should be safe (idempotent) + executor.shutdown() + executor.shutdown() + + +@multi_gpu_test(num_gpus=4) +def test_multiproc_executor_pipeline_parallel(): + """Test MultiprocExecutor with pipeline parallelism.""" + vllm_config = create_vllm_config( + tensor_parallel_size=2, + pipeline_parallel_size=2, + ) + + executor = MultiprocExecutor(vllm_config=vllm_config) + + try: + # Verify executor properties + assert executor.world_size == 4, "World size should be 4 for TP=2, PP=2" + assert len(executor.workers) == 4, "Should have 4 workers" + + # Verify output rank calculation + # For TP=2, PP=2: output should be from the last PP stage (ranks 2-3) + # Specifically rank 2 (first rank of last PP stage) + output_rank = executor._get_output_rank() + assert output_rank == 2, "Output rank should be 2 (first rank of last PP stage)" + + # Verify max_concurrent_batches for pipeline parallel + assert executor.max_concurrent_batches == 2, ( + "Max concurrent batches should equal PP size" + ) + + finally: + # Clean up + executor.shutdown() + + +def test_multiproc_executor_properties(): + """Test various executor properties and configurations.""" + vllm_config = create_vllm_config( + tensor_parallel_size=1, + pipeline_parallel_size=1, + ) + + executor = MultiprocExecutor(vllm_config=vllm_config) + + try: + # Test supports_pp property + assert MultiprocExecutor.supports_pp is True, ( + "MultiprocExecutor should support pipeline parallelism" + ) + + # Test world_size calculation + assert executor.world_size == ( + executor.parallel_config.tensor_parallel_size + * executor.parallel_config.pipeline_parallel_size + ), "World size should equal TP * PP" + + # Test local_world_size calculation + assert executor.local_world_size == ( + executor.parallel_config.world_size // executor.parallel_config.nnodes + ), "Local world size should be world_size / nnodes" + + finally: + # Clean up + executor.shutdown() + + +@multi_gpu_test(num_gpus=4) +def test_multiproc_executor_multi_node(): + """ + Test MultiprocExecutor with multi-node configuration. 
+ This simulates 2 nodes with TP=4: + - Node 0 (rank 0): Uses GPUs 0,1 (CUDA_VISIBLE_DEVICES=0,1) with TP=2 + - Node 1 (rank 1): Uses GPUs 2,3 (CUDA_VISIBLE_DEVICES=2,3) with TP=2 + Total world_size = 4, nnodes = 2 + """ + port = get_open_port() + # symm_mem does not work for simulating multi instance in single node + os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0" + + def run_node(node_rank: int, result_queue: multiprocessing.Queue, port: int): + """Run a single node's executor.""" + executor = None + try: + # Set CUDA_VISIBLE_DEVICES for this node + if node_rank == 0: + os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" + else: + os.environ["CUDA_VISIBLE_DEVICES"] = "2,3" + + # Create config for this node + vllm_config = create_vllm_config( + tensor_parallel_size=4, # Total TP across all nodes + pipeline_parallel_size=1, + nnodes=2, # 2 nodes + node_rank=node_rank, + master_port=port, # same port + ) + + # Create executor for this node + executor = MultiprocExecutor(vllm_config=vllm_config) + + # Verify node-specific properties + assert executor.world_size == 4, ( + f"World size should be 4 on node {node_rank}" + ) + assert executor.local_world_size == 2, ( + f"Local world size should be 2 on node {node_rank}" + ) + assert len(executor.workers) == 2, ( + f"Should have 2 local workers on node {node_rank}" + ) + + # Verify worker ranks are correct for this node + expected_ranks = [node_rank * 2, node_rank * 2 + 1] + actual_ranks = sorted([w.rank for w in executor.workers]) + assert actual_ranks == expected_ranks, ( + f"Node {node_rank} should have workers " + f"with ranks {expected_ranks}, got {actual_ranks}" + ) + # Verify all workers are alive + for worker in executor.workers: + assert worker.proc.is_alive(), ( + f"Worker rank {worker.rank} should be alive on node {node_rank}" + ) + # executor.gen + # Put success result in queue BEFORE shutdown to avoid hanging + result_queue.put({"node": node_rank, "success": True}) + import time + + time.sleep(2) + executor.shutdown() + except Exception as e: + # Put failure result in queue + result_queue.put({"node": node_rank, "success": False, "error": str(e)}) + raise e + finally: + if executor is not None: + executor.shutdown() + + # Create a queue to collect results from both processes + result_queue: multiprocessing.Queue[dict[str, int | bool]] = multiprocessing.Queue() + + # Start both node processes + processes = [] + for node_rank in range(2): + p = multiprocessing.Process( + target=run_node, + args=(node_rank, result_queue, port), + name=f"Node{node_rank}", + ) + p.start() + processes.append(p) + + # Wait for both processes to complete + all_completed = True + for p in processes: + p.join(timeout=60) + if p.is_alive(): + p.terminate() + p.join(timeout=20) + if p.is_alive(): + p.kill() + p.join() + all_completed = False + + # Check results from both nodes + results: list[dict[str, int | bool]] = [] + while len(results) < 2: + try: + result = result_queue.get(timeout=1) + results.append(result) + except Exception: + pass + assert all_completed, "Not all processes completed successfully" + assert len(results) == 2, f"Expected 2 results, got {len(results)}" + assert results[0]["success"], f"Node 0 failed: {results[0]}" + assert results[1]["success"], f"Node 1 failed: {results[1]}" diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 61bcd15e06a8..9a6326d62e82 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -210,6 +210,18 @@ class ParallelConfig: class is dynamically inherited by the worker class. 
This is used to inject new attributes and methods to the worker class for use in collective_rpc calls.""" + master_addr: str = "127.0.0.1" + """distributed master address for multi-node distributed + inference when distributed_executor_backend is mp.""" + master_port: int = 29501 + """distributed master port for multi-node distributed + inference when distributed_executor_backend is mp.""" + node_rank: int = 0 + """distributed node rank for multi-node distributed + inference when distributed_executor_backend is mp.""" + nnodes: int = 1 + """num of nodes for multi-node distributed + inference when distributed_executor_backend is mp.""" world_size: int = Field(init=False) """world_size is TPxPP, it affects the number of workers we create.""" @@ -387,6 +399,23 @@ def use_sequence_parallel_moe(self) -> bool: and self.data_parallel_size > 1 ) + @property + def node_rank_within_dp(self) -> int: + return self.node_rank % self.nnodes_within_dp + + @property + def nnodes_within_dp(self) -> int: + if self.nnodes == 1: + return 1 + data_parallel_node_size = ( + self.data_parallel_size // self.data_parallel_size_local + ) + return self.nnodes // data_parallel_node_size + + @property + def local_world_size(self) -> int: + return self.world_size // self.nnodes_within_dp + @staticmethod def has_unfinished_dp(dp_group: ProcessGroup, has_unfinished: bool) -> bool: tensor = torch.tensor([has_unfinished], dtype=torch.int32, device="cpu") @@ -528,6 +557,8 @@ def __post_init__(self) -> None: ray_found = ray_utils.ray_is_available() if current_platform.is_tpu() and envs.VLLM_XLA_USE_SPMD: backend = "uni" + elif current_platform.is_cuda() and self.nnodes > 1: + backend = "mp" elif ( current_platform.is_cuda() and cuda_device_count_stateless() < self.world_size @@ -565,6 +596,10 @@ def __post_init__(self) -> None: "max_parallel_loading_workers is currently " "not supported and will be ignored." ) + if self.distributed_executor_backend != "mp" and self.nnodes > 1: + raise ValueError( + "nnodes > 1 can only be set when distributed exectuor backend is mp." + ) @property def use_ray(self) -> bool: @@ -607,6 +642,11 @@ def _verify_args(self) -> Self: "Disabled the custom all-reduce kernel because it is not " "supported on current platform." ) + if self.nnodes > 1: + self.disable_custom_all_reduce = True + logger.debug( + "Disabled the custom all-reduce since we are running on multi-node." + ) if self.ray_workers_use_nsight and not self.use_ray: raise ValueError( "Unable to use nsight profiling unless workers run with Ray." diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index 5046cac2e90a..052df19e34d7 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -8,7 +8,7 @@ from multiprocessing import shared_memory from pickle import PickleBuffer from threading import Event -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, cast from unittest.mock import patch import torch @@ -602,13 +602,87 @@ def broadcast_object(self, obj=None): return obj return self.dequeue() + @staticmethod + def create_from_process_group_single_reader( + pg: ProcessGroup, + max_chunk_bytes, + max_chunks, + reader_rank: int = 0, + blocking: bool = False, + ) -> tuple["MessageQueue", list[Handle]]: + """ + Creates a MessageQueue for a process group with a single reader. 
+ + This method is designed for scenarios where only one process (the reader) + will consume messages, and all other processes are writers. It sets up + the shared memory buffer and communication handles accordingly, and + gathers the handles from all processes to the reader. + + Args: + pg (ProcessGroup): The torch distributed process group. + max_chunk_bytes (int): Maximum size in bytes for each chunk in the buffer. + max_chunks (int): Maximum number of chunks in the buffer. + reader_rank (int, optional): The global rank that will act as the reader. + Defaults to 0. + blocking (bool, optional): If True, blocks until all processes are ready. + Defaults to False. + + Returns: + tuple[MessageQueue, list[Handle]]: + The MessageQueue instance for the calling process, + and a list of handles (only non-empty for the reader process). + """ + local_size = torch.cuda.device_count() + rank = dist.get_rank() + same_node = rank // local_size == reader_rank // local_size + buffer_io = MessageQueue( + n_reader=1, + n_local_reader=1 if same_node else 0, + max_chunk_bytes=max_chunk_bytes, + max_chunks=max_chunks, + ) + handle = buffer_io.export_handle() + handles = [None] * dist.get_world_size(pg) if rank == reader_rank else None + dist.gather_object(handle, handles, dst=reader_rank, group=pg) + if blocking: + buffer_io.wait_until_ready() + return buffer_io, cast(list[Handle], handles or []) + @staticmethod def create_from_process_group( pg: ProcessGroup | StatelessProcessGroup, max_chunk_bytes, max_chunks, - writer_rank=0, + writer_rank: int = 0, + external_writer_handle=None, + blocking: bool = True, ) -> "MessageQueue": + """ + Creates a MessageQueue for a distributed process group with one writer and + multiple readers. + + This method is designed for scenarios where one process (the writer) sends + messages, and all other processes (the readers) receive messages. It sets up + the shared memory buffer and socket communication handles accordingly, and + broadcasts the handle from the writer to all readers. + + Args: + pg (ProcessGroup | StatelessProcessGroup): The torch distributed process + group. + max_chunk_bytes (int): Maximum size in bytes for each chunk in the buffer. + max_chunks (int): Maximum number of chunks in the buffer. + writer_rank (int, optional): The global rank that will act as the writer. + Defaults to 0. + external_writer_handle (Handle, optional): Used when there is a handle + from an external Message Queue. If provided, use this handle to init + PG writer message queue instead of creating a new one. Defaults to None. + blocking (bool, optional): If True, blocks until all processes are ready. + Defaults to True. + + Returns: + MessageQueue: The MessageQueue instance for the calling process. 
+ + """ if isinstance(pg, ProcessGroup): group_rank = dist.get_rank(pg) group_world_size = dist.get_world_size(pg) @@ -617,23 +691,26 @@ def create_from_process_group( group_rank = pg.rank group_world_size = pg.world_size global_ranks = list(range(pg.world_size)) - from vllm.distributed.parallel_state import in_the_same_node_as status = in_the_same_node_as(pg, source_rank=writer_rank) - same_node_ranks = [i for i, s in enumerate(status) if s] - n_reader = group_world_size - 1 - n_local_reader = len(same_node_ranks) - 1 - local_reader_ranks = [i for i in same_node_ranks if i != writer_rank] - buffer_io: MessageQueue if group_rank == writer_rank: - buffer_io = MessageQueue( - n_reader=n_reader, - n_local_reader=n_local_reader, - local_reader_ranks=local_reader_ranks, - max_chunk_bytes=max_chunk_bytes, - max_chunks=max_chunks, - ) + if external_writer_handle is not None: + buffer_io = MessageQueue.create_from_handle( + external_writer_handle, group_rank + ) + else: + same_node_ranks = [i for i, s in enumerate(status) if s] + n_reader = group_world_size - 1 + n_local_reader = len(same_node_ranks) - 1 + local_reader_ranks = [i for i in same_node_ranks if i != writer_rank] + buffer_io = MessageQueue( + n_reader=n_reader, + n_local_reader=n_local_reader, + local_reader_ranks=local_reader_ranks, + max_chunk_bytes=max_chunk_bytes, + max_chunks=max_chunks, + ) handle = buffer_io.export_handle() if isinstance(pg, ProcessGroup): dist.broadcast_object_list( @@ -651,5 +728,6 @@ def create_from_process_group( else: handle = pg.broadcast_obj(None, writer_rank) buffer_io = MessageQueue.create_from_handle(handle, group_rank) - buffer_io.wait_until_ready() + if blocking: + buffer_io.wait_until_ready() return buffer_io diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index c78e6a32733c..852c4c644433 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -385,6 +385,33 @@ def __init__( torch.ops._C, "init_shm_manager" ) + def create_mq_broadcaster( + self, writer_rank=0, external_writer_handle=None, blocking=True + ): + from vllm.distributed.device_communicators.shm_broadcast import MessageQueue + + return MessageQueue.create_from_process_group( + self.cpu_group, + 1 << 22, + 6, + writer_rank=writer_rank, + external_writer_handle=external_writer_handle, + blocking=blocking, + ) + + def create_single_reader_mq_broadcasters( + self, reader_rank_in_group=0, blocking=False + ): + from vllm.distributed.device_communicators.shm_broadcast import MessageQueue + + return MessageQueue.create_from_process_group_single_reader( + self.cpu_group, + 1 << 22, + 6, + reader_rank=self.ranks[reader_rank_in_group], + blocking=blocking, + ) + @property def first_rank(self): """Return the global rank of the first process in the group""" @@ -997,6 +1024,7 @@ def combine( _WORLD: GroupCoordinator | None = None +_INNER_DP_WORLD: GroupCoordinator | None = None _NODE_COUNT: int | None = None @@ -1005,6 +1033,11 @@ def get_world_group() -> GroupCoordinator: return _WORLD +def get_inner_dp_world_group() -> GroupCoordinator: + assert _INNER_DP_WORLD is not None, "inner dp world group is not initialized" + return _INNER_DP_WORLD + + def init_world_group( ranks: list[int], local_rank: int, backend: str ) -> GroupCoordinator: @@ -1023,12 +1056,13 @@ def init_model_parallel_group( backend: str, use_message_queue_broadcaster: bool = False, group_name: str | None = None, + use_device_communicator: bool = True, ) -> GroupCoordinator: return GroupCoordinator( 
group_ranks=group_ranks, local_rank=local_rank, torch_distributed_backend=backend, - use_device_communicator=True, + use_device_communicator=use_device_communicator, use_message_queue_broadcaster=use_message_queue_broadcaster, group_name=group_name, ) @@ -1143,7 +1177,14 @@ def init_distributed_environment( from vllm.config import get_current_vllm_config config = get_current_vllm_config() - if ( + if config is not None and config.parallel_config.nnodes > 1: + parallel_config = config.parallel_config + ip = parallel_config.master_addr + rank = parallel_config.data_parallel_rank * world_size + rank + world_size = parallel_config.world_size_across_dp + port = parallel_config.master_port + distributed_init_method = get_distributed_init_method(ip, port) + elif ( config is not None and config.parallel_config.data_parallel_size > 1 and config.parallel_config.distributed_executor_backend != "external_launcher" @@ -1164,6 +1205,14 @@ def init_distributed_environment( distributed_init_method, ) if not torch.distributed.is_initialized(): + logger.info( + "world_size=%d rank=%d local_rank=%d distributed_init_method=%s backend=%s", + world_size, + rank, + local_rank, + distributed_init_method, + backend, + ) assert distributed_init_method is not None, ( "distributed_init_method must be provided when initializing " "distributed environment" @@ -1192,16 +1241,36 @@ def init_distributed_environment( # local rank not set, this usually happens in single-node # setting, where we can use rank as local rank local_rank = envs.LOCAL_RANK if distributed_init_method == "env://" else rank - global _WORLD, _NODE_COUNT + global _WORLD, _NODE_COUNT, _INNER_DP_WORLD if _WORLD is None: ranks = list(range(torch.distributed.get_world_size())) _WORLD = init_world_group(ranks, local_rank, backend) - _NODE_COUNT = _node_count(_WORLD.cpu_group) + if config.parallel_config.nnodes > 1: + _NODE_COUNT = config.parallel_config.nnodes + else: + _NODE_COUNT = _node_count(_WORLD.cpu_group) logger.debug("Detected %d nodes in the distributed environment", _NODE_COUNT) else: assert _WORLD.world_size == torch.distributed.get_world_size(), ( "world group already initialized with a different world size" ) + if config.parallel_config.nnodes_within_dp > 1: + if parallel_config.data_parallel_size > 1: + world_size_inner_dp = parallel_config.world_size + group_ranks = [ + [dp_rank * world_size_inner_dp + i for i in range(world_size_inner_dp)] + for dp_rank in range(parallel_config.data_parallel_size) + ] + _INNER_DP_WORLD = init_model_parallel_group( + group_ranks, + get_world_group().local_rank, + backend, + use_message_queue_broadcaster=True, + group_name="inner_dp_world", + use_device_communicator=False, + ) + else: + _INNER_DP_WORLD = _WORLD def initialize_model_parallel( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 999ed780c20b..d011dfdbfbb2 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -384,6 +384,10 @@ class EngineArgs: ) = ParallelConfig.distributed_executor_backend # number of P/D disaggregation (or other disaggregation) workers pipeline_parallel_size: int = ParallelConfig.pipeline_parallel_size + master_addr: str = ParallelConfig.master_addr + master_port: int = ParallelConfig.master_port + nnodes: int = ParallelConfig.nnodes + node_rank: int = ParallelConfig.node_rank tensor_parallel_size: int = ParallelConfig.tensor_parallel_size decode_context_parallel_size: int = ParallelConfig.decode_context_parallel_size dcp_kv_cache_interleave_size: int = 
ParallelConfig.dcp_kv_cache_interleave_size @@ -394,6 +398,7 @@ class EngineArgs: data_parallel_address: str | None = None data_parallel_rpc_port: int | None = None data_parallel_hybrid_lb: bool = False + data_parallel_external_lb: bool = False data_parallel_backend: str = ParallelConfig.data_parallel_backend enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel all2all_backend: str | None = ParallelConfig.all2all_backend @@ -749,6 +754,10 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "-pp", **parallel_kwargs["pipeline_parallel_size"], ) + parallel_group.add_argument("--master-addr", **parallel_kwargs["master_addr"]) + parallel_group.add_argument("--master-port", **parallel_kwargs["master_port"]) + parallel_group.add_argument("--nnodes", "-n", **parallel_kwargs["nnodes"]) + parallel_group.add_argument("--node-rank", "-r", **parallel_kwargs["node_rank"]) parallel_group.add_argument( "--tensor-parallel-size", "-tp", **parallel_kwargs["tensor_parallel_size"] ) @@ -803,7 +812,14 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: help='Backend for data parallel, either "mp" or "ray".', ) parallel_group.add_argument( - "--data-parallel-hybrid-lb", **parallel_kwargs["data_parallel_hybrid_lb"] + "--data-parallel-hybrid-lb", + "-dph", + **parallel_kwargs["data_parallel_hybrid_lb"], + ) + parallel_group.add_argument( + "--data-parallel-external-lb", + "-dpe", + **parallel_kwargs["data_parallel_external_lb"], ) parallel_group.add_argument( "--enable-expert-parallel", **parallel_kwargs["enable_expert_parallel"] @@ -1428,12 +1444,56 @@ def create_engine_config( assert not headless or not self.data_parallel_hybrid_lb, ( "data_parallel_hybrid_lb is not applicable in headless mode" ) - - data_parallel_external_lb = self.data_parallel_rank is not None + assert not (self.data_parallel_hybrid_lb and self.data_parallel_external_lb), ( + "data_parallel_hybrid_lb and data_parallel_external_lb cannot both be True." + ) + assert self.data_parallel_backend == "mp" or self.nnodes == 1, ( + "nnodes > 1 is only supported with data_parallel_backend=mp" + ) + inferred_data_parallel_rank = 0 + if self.nnodes > 1: + world_size = ( + self.data_parallel_size + * self.pipeline_parallel_size + * self.tensor_parallel_size + ) + world_size_within_dp = ( + self.pipeline_parallel_size * self.tensor_parallel_size + ) + local_world_size = world_size // self.nnodes + assert world_size % self.nnodes == 0, ( + f"world_size={world_size} must be divisible by nnodes={self.nnodes}." + ) + assert self.node_rank < self.nnodes, ( + f"node_rank={self.node_rank} must be less than nnodes={self.nnodes}." + ) + inferred_data_parallel_rank = ( + self.node_rank * local_world_size + ) // world_size_within_dp + if self.data_parallel_size > 1 and self.data_parallel_external_lb: + self.data_parallel_rank = inferred_data_parallel_rank + logger.info( + "Inferred data_parallel_rank %d from node_rank %d for external lb", + self.data_parallel_rank, + self.node_rank, + ) + elif self.data_parallel_size_local is None: + # Infer data parallel size local for internal dplb: + self.data_parallel_size_local = max( + local_world_size // world_size_within_dp, 1 + ) + data_parallel_external_lb = ( + self.data_parallel_external_lb or self.data_parallel_rank is not None + ) # Local DP rank = 1, use pure-external LB. if data_parallel_external_lb: + assert self.data_parallel_rank is not None, ( + "data_parallel_rank or node_rank must be spefified if " + "data_parallel_external_lb is enable." 
+ ) assert self.data_parallel_size_local in (1, None), ( - "data_parallel_size_local must be 1 when data_parallel_rank is set" + "data_parallel_size_local must be 1 or None when data_parallel_rank " + "is set" ) data_parallel_size_local = 1 # Use full external lb if we have local_size of 1. @@ -1447,6 +1507,11 @@ def create_engine_config( if self.data_parallel_hybrid_lb and data_parallel_size_local == 1: # Use full external lb if we have local_size of 1. + logger.warning( + "data_parallel_hybrid_lb is not eligible when " + "data_parallel_size_local = 1, autoswitch to " + "data_parallel_external_lb." + ) data_parallel_external_lb = True self.data_parallel_hybrid_lb = False @@ -1454,7 +1519,15 @@ def create_engine_config( # Disable hybrid LB mode if set for a single node self.data_parallel_hybrid_lb = False - self.data_parallel_rank = self.data_parallel_start_rank or 0 + self.data_parallel_rank = ( + self.data_parallel_start_rank or inferred_data_parallel_rank + ) + if self.nnodes > 1: + logger.info( + "Inferred data_parallel_rank %d from node_rank %d", + self.data_parallel_rank, + self.node_rank, + ) else: assert not self.data_parallel_hybrid_lb, ( "data_parallel_size_local must be set to use data_parallel_hybrid_lb." @@ -1484,7 +1557,9 @@ def create_engine_config( "data_parallel_backend can only be ray or mp, got %s", self.data_parallel_backend, ) - data_parallel_address = ParallelConfig.data_parallel_master_ip + data_parallel_address = ( + self.master_addr or ParallelConfig.data_parallel_master_ip + ) else: data_parallel_address = self.data_parallel_address @@ -1517,6 +1592,10 @@ def create_engine_config( data_parallel_rank=self.data_parallel_rank or 0, data_parallel_external_lb=data_parallel_external_lb, data_parallel_size_local=data_parallel_size_local, + master_addr=self.master_addr, + master_port=self.master_port, + nnodes=self.nnodes, + node_rank=self.node_rank, data_parallel_master_ip=data_parallel_address, data_parallel_rpc_port=data_parallel_rpc_port, data_parallel_backend=self.data_parallel_backend, diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index 2678658dd126..96608f360e17 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -24,6 +24,7 @@ from vllm.v1.engine.core import EngineCoreProc from vllm.v1.engine.utils import CoreEngineProcManager, launch_core_engines from vllm.v1.executor import Executor +from vllm.v1.executor.multiproc_executor import MultiprocExecutor from vllm.v1.metrics.prometheus import setup_multiprocess_prometheus from vllm.v1.utils import APIServerProcessManager, wait_for_completion_or_failure @@ -97,18 +98,40 @@ def run_headless(args: argparse.Namespace): if local_engine_count <= 0: raise ValueError("data_parallel_size_local must be > 0 in headless mode") - host = parallel_config.data_parallel_master_ip - port = engine_args.data_parallel_rpc_port # add to config too - handshake_address = get_tcp_uri(host, port) + shutdown_requested = False # Catch SIGTERM and SIGINT to allow graceful shutdown. def signal_handler(signum, frame): + nonlocal shutdown_requested logger.debug("Received %d signal.", signum) - raise SystemExit + if not shutdown_requested: + shutdown_requested = True + raise SystemExit signal.signal(signal.SIGTERM, signal_handler) signal.signal(signal.SIGINT, signal_handler) + if parallel_config.node_rank_within_dp > 0: + from vllm.version import __version__ as VLLM_VERSION + + # Run headless workers (for multi-node PP/TP). 
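For a concrete picture of how a two-node deployment is meant to be described with the new arguments, here is a hedged sketch; the model name, address, and port values are placeholders, not taken from the patch. Node rank 0 hosts the API server, while a node with a non-zero rank takes the headless branch above and only runs the MultiprocExecutor workers:

```python
from vllm.engine.arg_utils import EngineArgs

# Illustrative only: two nodes, 2 GPUs each, one tensor-parallel group of 4.
node_rank = 0  # set to 1 on the second machine

args = EngineArgs(
    model="facebook/opt-125m",       # placeholder model
    tensor_parallel_size=4,          # spans both nodes
    distributed_executor_backend="mp",
    nnodes=2,
    node_rank=node_rank,
    master_addr="192.0.2.10",        # reachable address of node rank 0
    master_port=29501,
)
# create_engine_config() then builds a ParallelConfig whose local world size
# is 4 // 2 == 2 workers per node, and infers the node's data-parallel rank
# from node_rank.
```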
+ host = parallel_config.master_addr + head_node_address = f"{host}:{parallel_config.master_port}" + logger.info( + "Launching vLLM (v%s) headless multiproc executor, " + "with head node address %s for torch.distributed process group.", + VLLM_VERSION, + head_node_address, + ) + + executor = MultiprocExecutor(vllm_config, monitor_workers=False) + executor.start_worker_monitor(inline=True) + return + + host = parallel_config.data_parallel_master_ip + port = parallel_config.data_parallel_rpc_port + handshake_address = get_tcp_uri(host, port) + logger.info( "Launching %d data parallel engine(s) in headless mode, " "with head node address %s.", diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py index e74519b21aa6..d65cad7af03d 100644 --- a/vllm/v1/engine/utils.py +++ b/vllm/v1/engine/utils.py @@ -183,15 +183,19 @@ def set_device_control_env_var( for engine subprocess. """ world_size = vllm_config.parallel_config.world_size + local_world_size = vllm_config.parallel_config.local_world_size evar = current_platform.device_control_env_var - value = get_device_indices(evar, local_dp_rank, world_size) + value = get_device_indices(evar, local_dp_rank, world_size, local_world_size) with patch.dict(os.environ, values=((evar, value),)): yield def get_device_indices( - device_control_env_var: str, local_dp_rank: int, world_size: int + device_control_env_var: str, + local_dp_rank: int, + world_size: int, + local_world_size: int | None = None, ): """ Returns a comma-separated string of device indices for the specified @@ -200,10 +204,15 @@ def get_device_indices( For example, if world_size=2 and local_dp_rank=1, and there are 4 devices, this will select devices 2 and 3 for local_dp_rank=1. """ + if local_world_size is None: + local_world_size = world_size try: value = ",".join( str(current_platform.device_id_to_physical_device_id(i)) - for i in range(local_dp_rank * world_size, (local_dp_rank + 1) * world_size) + for i in range( + local_dp_rank * world_size, + local_dp_rank * world_size + local_world_size, + ) ) except IndexError as e: raise Exception( diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 881e6ef40aaf..ad2ece50f981 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -10,7 +10,7 @@ import traceback import weakref from collections import deque -from collections.abc import Callable +from collections.abc import Callable, Sequence from concurrent.futures import Future, InvalidStateError from contextlib import suppress from dataclasses import dataclass @@ -34,6 +34,7 @@ get_dcp_group, get_dp_group, get_ep_group, + get_inner_dp_world_group, get_pp_group, get_tp_group, ) @@ -90,6 +91,10 @@ def wait_for_response(self, get_response: Callable): class MultiprocExecutor(Executor): supports_pp: bool = True + def __init__(self, vllm_config: VllmConfig, monitor_workers: bool = True): + self.monitor_workers = monitor_workers + super().__init__(vllm_config) + def _init_executor(self) -> None: # Call self.shutdown at exit to clean up # and ensure workers will be terminated. @@ -99,6 +104,12 @@ def _init_executor(self) -> None: self.failure_callback: FailureCallback | None = None self.world_size = self.parallel_config.world_size + assert self.world_size % self.parallel_config.nnodes_within_dp == 0, ( + f"global world_size ({self.parallel_config.world_size}) must be " + f"divisible by nnodes_within_dp " + f"({self.parallel_config.nnodes_within_dp}). 
" + ) + self.local_world_size = self.parallel_config.local_world_size tensor_parallel_size = self.parallel_config.tensor_parallel_size pp_parallel_size = self.parallel_config.pipeline_parallel_size assert self.world_size == tensor_parallel_size * pp_parallel_size, ( @@ -116,27 +127,37 @@ def _init_executor(self) -> None: distributed_init_method = get_distributed_init_method( get_loopback_ip(), get_open_port() ) - + self.rpc_broadcast_mq: MessageQueue | None = None + scheduler_output_handle: Handle | None = None # Initialize worker and set up message queues for SchedulerOutputs # and ModelRunnerOutputs - max_chunk_bytes = envs.VLLM_MQ_MAX_CHUNK_BYTES_MB * 1024 * 1024 - self.rpc_broadcast_mq = MessageQueue( - self.world_size, self.world_size, max_chunk_bytes=max_chunk_bytes - ) - scheduler_output_handle = self.rpc_broadcast_mq.export_handle() - + if self.parallel_config.node_rank_within_dp == 0: + # For leader node within each dp rank, + # each dp will have its own leader multiproc executor. + max_chunk_bytes = envs.VLLM_MQ_MAX_CHUNK_BYTES_MB * 1024 * 1024 + self.rpc_broadcast_mq = MessageQueue( + self.world_size, + self.local_world_size, + max_chunk_bytes=max_chunk_bytes, + connect_ip=self.parallel_config.master_addr, + ) + scheduler_output_handle = self.rpc_broadcast_mq.export_handle() # Create workers context = get_mp_context() shared_worker_lock = context.Lock() unready_workers: list[UnreadyWorkerProcHandle] = [] success = False try: - for rank in range(self.world_size): + global_start_rank = ( + self.local_world_size * self.parallel_config.node_rank_within_dp + ) + for local_rank in range(self.local_world_size): + global_rank = global_start_rank + local_rank unready_workers.append( WorkerProc.make_worker_process( vllm_config=self.vllm_config, - local_rank=rank, - rank=rank, + local_rank=local_rank, + rank=global_rank, distributed_init_method=distributed_init_method, input_shm_handle=scheduler_output_handle, shared_worker_lock=shared_worker_lock, @@ -145,15 +166,38 @@ def _init_executor(self) -> None: # Workers must be created before wait_for_ready to avoid # deadlock, since worker.init_device() does a device sync. + + # Wait for all local workers to be ready. self.workers = WorkerProc.wait_for_ready(unready_workers) + # Start background thread to monitor worker health if not in headless mode. + if self.monitor_workers: + self.start_worker_monitor() + + self.response_mqs = [] + # Only leader node have remote response mqs + if self.parallel_config.node_rank_within_dp == 0: + for rank in range(self.world_size): + if rank < self.local_world_size: + local_message_queue = self.workers[rank].worker_response_mq + assert local_message_queue is not None + self.response_mqs.append(local_message_queue) + else: + remote_message_queue = self.workers[0].peer_worker_response_mqs[ + rank + ] + assert remote_message_queue is not None + self.response_mqs.append(remote_message_queue) + # Ensure message queues are ready. Will deadlock if re-ordered # Must be kept consistent with the WorkerProc. - self.rpc_broadcast_mq.wait_until_ready() - for w in self.workers: - w.worker_response_mq.wait_until_ready() - self.start_worker_monitor() + # Wait for all input mqs to be ready. + if self.rpc_broadcast_mq is not None: + self.rpc_broadcast_mq.wait_until_ready() + # Wait for all remote response mqs to be ready. 
+ for response_mq in self.response_mqs: + response_mq.wait_until_ready() success = True finally: if not success: @@ -168,7 +212,7 @@ def _init_executor(self) -> None: self.output_rank = self._get_output_rank() - def start_worker_monitor(self): + def start_worker_monitor(self, inline=False) -> None: workers = self.workers self_ref = weakref.ref(self) @@ -192,9 +236,13 @@ def monitor_workers(): _self.failure_callback = None callback() - Thread( - target=monitor_workers, daemon=True, name="MultiprocWorkerMonitor" - ).start() + if not inline: + Thread( + target=monitor_workers, daemon=True, name="MultiprocWorkerMonitor" + ).start() + return + + monitor_workers() def register_failure_callback(self, callback: FailureCallback): if self.is_failed: @@ -247,7 +295,9 @@ def collective_rpc( # type: ignore[override] ) -> Any | list[Any] | Future[Any | list[Any]]: """Returns single result if unique_reply_rank and/or kv_output_aggregator is provided, otherwise list.""" - + assert self.rpc_broadcast_mq is not None, ( + "collective_rpc should not be called on follower node" + ) if self.is_failed: raise RuntimeError("Executor failed.") @@ -269,20 +319,20 @@ def collective_rpc( # type: ignore[override] send_method = cloudpickle.dumps(method, protocol=pickle.HIGHEST_PROTOCOL) self.rpc_broadcast_mq.enqueue((send_method, args, kwargs, output_rank)) - workers = ( - (self.workers[output_rank],) if output_rank is not None else self.workers - ) + response_mqs: Sequence[MessageQueue] = self.response_mqs + if output_rank is not None: + response_mqs = (response_mqs[output_rank],) shutdown_event = self.shutdown_event def get_response(): responses = [] - for w in workers: + for mq in response_mqs: dequeue_timeout = ( None if deadline is None else (deadline - time.monotonic()) ) try: - status, result = w.worker_response_mq.dequeue( + status, result = mq.dequeue( timeout=dequeue_timeout, cancel=shutdown_event ) except TimeoutError as e: @@ -391,17 +441,26 @@ class UnreadyWorkerProcHandle: class WorkerProcHandle: proc: BaseProcess rank: int - worker_response_mq: MessageQueue # The worker process writes to this MQ + # The worker process writes to this MQ in single-node mode + worker_response_mq: MessageQueue | None + # This is only non empty on driver node, + # the peer worker process i writes to MQ + # `peer_worker_response_mqs[i]` + peer_worker_response_mqs: list[MessageQueue | None] death_writer: Connection | None = None @classmethod def from_unready_handle( - cls, unready_handle: UnreadyWorkerProcHandle, worker_response_mq: MessageQueue + cls, + unready_handle: UnreadyWorkerProcHandle, + worker_response_mq: MessageQueue | None, + peer_worker_response_mqs: list[MessageQueue | None], ) -> "WorkerProcHandle": return cls( proc=unready_handle.proc, rank=unready_handle.rank, worker_response_mq=worker_response_mq, + peer_worker_response_mqs=peer_worker_response_mqs, death_writer=unready_handle.death_writer, ) @@ -411,6 +470,38 @@ class WorkerProc: READY_STR = "READY" + def _init_message_queues( + self, input_shm_handle: Handle, vllm_config: VllmConfig + ) -> None: + if vllm_config.parallel_config.nnodes_within_dp == 1: + # Initialize MessageQueue for receiving SchedulerOutput + self.rpc_broadcast_mq = MessageQueue.create_from_handle( + input_shm_handle, self.worker.rank + ) + + # Initializes a message queue for sending the model output + self.worker_response_mq: MessageQueue = MessageQueue(1, 1) + self.peer_response_handles = [] + else: + # Initialize remote MessageQueue for receiving SchedulerOutput across nodes + 
self.rpc_broadcast_mq = get_inner_dp_world_group().create_mq_broadcaster( + external_writer_handle=input_shm_handle, + # Since there is external_writer_handle from executor proc, + # where the ready signal from actual writer is sent out of the + # create_mq_broadcaster method and after this setup, we make it + # non blocking. The handshake will be triggered when + # worker.rpc_broadcast_mq.wait_until_ready() is called + blocking=False, + ) + # Initializes remote message queue for sending the model output to the + # driver worker, exposing peer_response_handles for driver worker + # that include handles for all ranks + self.worker_response_mq, self.peer_response_handles = ( + get_inner_dp_world_group().create_single_reader_mq_broadcasters( + reader_rank_in_group=0 + ) + ) + def __init__( self, vllm_config: VllmConfig, @@ -421,13 +512,15 @@ def __init__( shared_worker_lock: LockType, ): self.rank = rank - wrapper = WorkerWrapperBase(vllm_config=vllm_config, rpc_rank=rank) + wrapper = WorkerWrapperBase( + vllm_config=vllm_config, rpc_rank=local_rank, global_rank=rank + ) # TODO: move `init_worker` to executor level as a collective rpc call all_kwargs: list[dict] = [ {} for _ in range(vllm_config.parallel_config.world_size) ] is_driver_worker = rank % vllm_config.parallel_config.tensor_parallel_size == 0 - all_kwargs[rank] = { + all_kwargs[local_rank] = { "vllm_config": vllm_config, "local_rank": local_rank, "rank": rank, @@ -438,14 +531,6 @@ def __init__( wrapper.init_worker(all_kwargs) self.worker = wrapper - # Initialize MessageQueue for receiving SchedulerOutput - self.rpc_broadcast_mq = MessageQueue.create_from_handle( - input_shm_handle, self.worker.rank - ) - - # Initializes a message queue for sending the model output - self.worker_response_mq = MessageQueue(1, 1) - scheduler_config = vllm_config.scheduler_config self.use_async_scheduling = scheduler_config.async_scheduling if self.use_async_scheduling: @@ -466,6 +551,7 @@ def __init__( ) # Load model + self._init_message_queues(input_shm_handle, vllm_config) self.worker.load_model() # Enable environment variable cache (e.g. assume no more @@ -512,6 +598,27 @@ def make_worker_process( # death_reader in child will get EOFError return UnreadyWorkerProcHandle(proc, rank, reader, death_writer) + @staticmethod + def wait_for_response_handle_ready( + handles: dict[str, Any], proc_handle: UnreadyWorkerProcHandle + ) -> WorkerProcHandle: + response_handle = handles["handle"] + worker_response_mq: MessageQueue | None = None + if len(response_handle.local_reader_ranks) > 0: + worker_response_mq = MessageQueue.create_from_handle(response_handle, 0) + peer_response_handles = handles["peer_response_handles"] + peer_worker_response_mqs = [ + MessageQueue.create_from_handle(handle, -1) + if handle.remote_subscribe_addr is not None + else None + for handle in peer_response_handles + ] + return WorkerProcHandle.from_unready_handle( + proc_handle, + worker_response_mq, + peer_worker_response_mqs=peer_worker_response_mqs, + ) + @staticmethod def wait_for_ready( unready_proc_handles: list[UnreadyWorkerProcHandle], @@ -537,16 +644,10 @@ def wait_for_ready( if response["status"] != "READY": raise e - # Extract the message queue handle. 
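A rough sketch of the rank bookkeeping used by the executor changes above: each node inside a DP rank owns `local_world_size` workers, and only the leader node reads responses for every global rank (local queues for its own workers, peer queues for remote ones). The function names and string placeholders below are illustrative, not part of the patch.

```python
# Simplified sketch: local-to-global rank mapping and which response queues the
# leader node (node_rank_within_dp == 0) ends up reading.
def global_ranks_for_node(local_world_size: int, node_rank_within_dp: int) -> list[int]:
    start = local_world_size * node_rank_within_dp
    return list(range(start, start + local_world_size))


def leader_response_sources(world_size: int, local_world_size: int) -> list[str]:
    sources = []
    for rank in range(world_size):
        if rank < local_world_size:
            sources.append(f"local worker_response_mq of rank {rank}")
        else:
            sources.append(f"peer_worker_response_mqs[{rank}] (remote node)")
    return sources


# Example: world_size=8 split across 2 nodes with 4 workers each.
assert global_ranks_for_node(local_world_size=4, node_rank_within_dp=1) == [4, 5, 6, 7]
print(leader_response_sources(world_size=8, local_world_size=4))
```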
- worker_response_mq = MessageQueue.create_from_handle( - response["handle"], 0 - ) - ready_proc_handles[unready_proc_handle.rank] = ( - WorkerProcHandle.from_unready_handle( - unready_proc_handle, worker_response_mq - ) + idx = unready_proc_handle.rank % len(ready_proc_handles) + ready_proc_handles[idx] = WorkerProc.wait_for_response_handle_ready( + response, unready_proc_handle ) - except EOFError: e.__suppress_context__ = True raise e from None @@ -618,12 +719,14 @@ def monitor_parent_death(): { "status": WorkerProc.READY_STR, "handle": worker.worker_response_mq.export_handle(), + "peer_response_handles": worker.peer_response_handles, } ) # Ensure message queues are ready. Will deadlock if re-ordered. # Must be kept consistent with the Executor - worker.rpc_broadcast_mq.wait_until_ready() + if worker.rpc_broadcast_mq is not None: + worker.rpc_broadcast_mq.wait_until_ready() worker.worker_response_mq.wait_until_ready() ready_writer.close() ready_writer = None diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 283e3744bcf6..42a844d96558 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -189,6 +189,7 @@ def init_device(self): and self.parallel_config.distributed_executor_backend not in ["ray", "external_launcher"] and self.vllm_config.parallel_config.data_parallel_backend != "ray" + and self.vllm_config.parallel_config.nnodes_within_dp == 1 ): # Use local DP rank if available, otherwise use global DP rank. dp_local_rank = self.parallel_config.data_parallel_rank_local @@ -205,7 +206,14 @@ def init_device(self): assert self.local_rank < torch.cuda.device_count(), ( f"DP adjusted local rank {self.local_rank} is out of bounds. " ) - + visible_device_count = ( + torch.cuda.device_count() if torch.cuda.is_available() else 0 + ) + assert self.parallel_config.local_world_size <= visible_device_count, ( + f"local_world_size ({self.parallel_config.local_world_size}) must be " + f"less than or equal to the number of visible devices " + f"({visible_device_count})." + ) self.device = torch.device(f"cuda:{self.local_rank}") current_platform.set_device(self.device) diff --git a/vllm/v1/worker/worker_base.py b/vllm/v1/worker/worker_base.py index 3991c16eefba..16f321c08077 100644 --- a/vllm/v1/worker/worker_base.py +++ b/vllm/v1/worker/worker_base.py @@ -180,6 +180,7 @@ def __init__( self, vllm_config: VllmConfig, rpc_rank: int = 0, + global_rank: int | None = None, ) -> None: """ Initialize the worker wrapper with the given vllm_config and rpc_rank. @@ -192,6 +193,7 @@ def __init__( group. 
""" self.rpc_rank = rpc_rank + self.global_rank = self.rpc_rank if global_rank is None else global_rank self.worker: WorkerBase | None = None # do not store this `vllm_config`, `init_worker` will set the final @@ -312,7 +314,7 @@ def init_worker(self, all_kwargs: list[dict[str, Any]]) -> None: assert self.worker is not None def initialize_from_config(self, kv_cache_configs: list[Any]) -> None: - kv_cache_config = kv_cache_configs[self.rpc_rank] + kv_cache_config = kv_cache_configs[self.global_rank] with set_current_vllm_config(self.vllm_config): self.worker.initialize_from_config(kv_cache_config) # type: ignore From af02c409702f2f41eb13471ce3224e3315e19d89 Mon Sep 17 00:00:00 2001 From: Dezhan Date: Sun, 16 Nov 2025 01:46:29 -0800 Subject: [PATCH 111/578] Fixed gpt-oss _load_weights_other() parameter position bug (#28715) Co-authored-by: Dezhan Tu --- vllm/model_executor/models/gpt_oss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 692ef605fe17..328c8c0ac4b7 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -641,8 +641,8 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: ) else: return self._load_weights_other( - ep_rank_end, ep_rank_start, + ep_rank_end, heads_per_rank, head_start, weights, From 3bc11757984ce256905d1b8517d50b514af8b175 Mon Sep 17 00:00:00 2001 From: scottzh8 Date: Sun, 16 Nov 2025 02:20:57 -0800 Subject: [PATCH 112/578] [Bugfix] Fix host and port join for ipv6 in bench serve (#28679) Signed-off-by: Scott Zhang Co-authored-by: Scott Zhang --- vllm/benchmarks/serve.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 0e9b0fbe2c02..dddb050ec180 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -49,6 +49,7 @@ from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.utils.gc_utils import freeze_gc_heap +from vllm.utils.network_utils import join_host_port MILLISECONDS_TO_SECONDS_CONVERSION = 1000 @@ -1333,8 +1334,9 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: api_url = f"{args.base_url}{args.endpoint}" base_url = f"{args.base_url}" else: - api_url = f"http://{args.host}:{args.port}{args.endpoint}" - base_url = f"http://{args.host}:{args.port}" + host_port = join_host_port(args.host, args.port) + api_url = f"http://{host_port}{args.endpoint}" + base_url = f"http://{host_port}" # Headers headers = None From 8d259fad6cd5a93bef04d00640e132e84c0c9b20 Mon Sep 17 00:00:00 2001 From: Anna Shors Date: Sun, 16 Nov 2025 05:12:45 -0800 Subject: [PATCH 113/578] Fix gpt oss weight loading with EP + bf16 (#28765) Signed-off-by: ashors1 --- vllm/model_executor/models/gpt_oss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 328c8c0ac4b7..7df3b087ccb8 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -494,8 +494,8 @@ def _load_weights_mxfp4( def _load_weights_other( self, - ep_rank_start: int, ep_rank_end: int, + ep_rank_start: int, heads_per_rank: int, head_start: int, weights: Iterable[tuple[str, torch.Tensor]], From 63fed5550609b96b578d2512aefced09efe76e1e Mon Sep 17 00:00:00 2001 From: Didier Durand 
<2927957+didier-durand@users.noreply.github.com> Date: Sun, 16 Nov 2025 15:30:06 +0100 Subject: [PATCH 114/578] [Doc]: fix typos in various files (#28811) Signed-off-by: Didier Durand --- docs/contributing/benchmarks.md | 2 +- docs/design/cuda_graphs.md | 2 +- docs/features/custom_arguments.md | 2 +- docs/features/custom_logitsprocs.md | 8 ++++---- docs/getting_started/installation/cpu.md | 2 +- docs/getting_started/installation/cpu.s390x.inc.md | 2 +- docs/getting_started/installation/cpu.x86.inc.md | 2 +- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md index ec0dfc4199d1..c9bc9cfe28a3 100644 --- a/docs/contributing/benchmarks.md +++ b/docs/contributing/benchmarks.md @@ -983,7 +983,7 @@ each document has close to 512 tokens. Please note that the `/v1/rerank` is also supported by embedding models. So if you're running with an embedding model, also set `--no_reranker`. Because in this case the query is -treated as a individual prompt by the server, here we send `random_batch_size - 1` documents +treated as an individual prompt by the server, here we send `random_batch_size - 1` documents to account for the extra prompt which is the query. The token accounting to report the throughput numbers correctly is also adjusted. diff --git a/docs/design/cuda_graphs.md b/docs/design/cuda_graphs.md index aac7b76eea26..66bf3b27d1f5 100644 --- a/docs/design/cuda_graphs.md +++ b/docs/design/cuda_graphs.md @@ -128,7 +128,7 @@ A [CUDAGraphWrapper][vllm.compilation.cuda_graph.CUDAGraphWrapper] instance wrap 3. Otherwise, i.e., the runtime_mode matches the mode of the wrapper, the wrapper will perform CUDA Graphs capture (if key does not exist, create a new entry and cache it) or replay (if key exists in the cache). -The above steps are based on the assumption that the CUDA Graphs wrapper would directly trust what’s in the forward context (controlled by the dispatcher). This lets us simplify and cenralize the logic, reducing the complexity as well as the risk of mismatched state between the wrappers and the dispatcher. It also allows reusing the wrapper class for both `FULL` and `PIECEWISE` runtime modes. See the implementation [here](https://github.com/vllm-project/vllm/blob/f751e50b7a2aae3110d83ed0d88202fc91b3e78a/vllm/compilation/cuda_graph.py#L106). +The above steps are based on the assumption that the CUDA Graphs wrapper would directly trust what’s in the forward context (controlled by the dispatcher). This lets us simplify and centralize the logic, reducing the complexity as well as the risk of mismatched state between the wrappers and the dispatcher. It also allows reusing the wrapper class for both `FULL` and `PIECEWISE` runtime modes. See the implementation [here](https://github.com/vllm-project/vllm/blob/f751e50b7a2aae3110d83ed0d88202fc91b3e78a/vllm/compilation/cuda_graph.py#L106). #### Nested Wrapper design diff --git a/docs/features/custom_arguments.md b/docs/features/custom_arguments.md index 7a650d0e79c2..728a2c89901d 100644 --- a/docs/features/custom_arguments.md +++ b/docs/features/custom_arguments.md @@ -5,7 +5,7 @@ You can use vLLM *custom arguments* to pass in arguments which are not part of t Custom arguments can be useful if, for example, you want to use a [custom logits processor](./custom_logitsprocs.md) without modifying the vLLM source code. !!! note - Make sure your custom logits processor have implemented `validate_params` for custom arguments. 
Otherwise invalid custom arguments can cause unexpected behaviour. + Make sure your custom logits processor have implemented `validate_params` for custom arguments. Otherwise, invalid custom arguments can cause unexpected behaviour. ## Offline Custom Arguments diff --git a/docs/features/custom_logitsprocs.md b/docs/features/custom_logitsprocs.md index 52fcc44efacc..5ddef9db1611 100644 --- a/docs/features/custom_logitsprocs.md +++ b/docs/features/custom_logitsprocs.md @@ -71,7 +71,7 @@ Logits processor `update_state()` implementations should assume the following mo * **"Condense" the batch to be contiguous:** starting with the lowest-index empty slot (which was caused by a Remove), apply a Unidirectional Move from the current highest non-empty slot in the batch to fill the empty slot. Proceed with additional Unidirectional Move operations in order of increasing empty slot destination index and decreasing non-empty slot source index until the batch is contiguous - * **Shrink the batch:** a side-effect of condensing the batch is that empty slots resulting from Remove operations are grouped in a contiguous block at the end of the batch array. Thus, after condensing, update `BatchUpdate.batch_size` to reflect the number of non-empty slots + * **Shrink the batch:** a side effect of condensing the batch is that empty slots resulting from Remove operations are grouped in a contiguous block at the end of the batch array. Thus, after condensing, update `BatchUpdate.batch_size` to reflect the number of non-empty slots 5. Reorder the batch for improved efficiency. Depending on the attention backend implementation and the current characteristics of the batch, zero or more Swap Move operations may be applied to reorder the batch @@ -286,7 +286,7 @@ Once you have created a custom subclass (like `WrappedPerReqLogitsProcessor`) wh ## Ways to Load Your Custom Logits Processor in vLLM -Logits processors are loaded at initialization. Critically, the set of loaded logits processors cannot be modified after the vLLM engine finishes loading, and new logits logits processors cannot be loaded on-demand for individual requests. +Logits processors are loaded at initialization. Critically, the set of loaded logits processors cannot be modified after the vLLM engine finishes loading, and new logits processors cannot be loaded on-demand for individual requests. This section details different ways of making your logits processor visible to vLLM and triggering vLLM to load your logits processor. @@ -438,7 +438,7 @@ The examples below show how a user would pass a custom argument (`target_token`) ## Best Practices for Writing Custom Logits Processors -Once vLLM loads a logits processor during initialization, then vLLM will invoke `update_state()` and `apply()` against that logits processor in every engine step. Both methods operate on all requests which currently reside in the vLLM persistent batch. Thus it is important to implement these methods efficiently. +Once vLLM loads a logits processor during initialization, then vLLM will invoke `update_state()` and `apply()` against that logits processor in every engine step. Both methods operate on all requests which currently reside in the vLLM persistent batch. Thus, it is important to implement these methods efficiently. 
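To make the efficiency point concrete, here is a small hypothetical logits processor whose `apply()` stays fully vectorized over the persistent batch; the class name and state layout are invented for illustration and are not the exact vLLM interface.

```python
# Hypothetical example -- illustrative state layout, not the verbatim vLLM API.
# Keeping per-request state in a tensor lets apply() be one batched add instead
# of a Python loop over requests.
import torch


class BannedTokenBias:
    def __init__(self, max_batch: int, vocab_size: int):
        # One row per persistent-batch slot; rows are rewritten in update_state().
        self.bias = torch.zeros(max_batch, vocab_size)

    def apply(self, logits: torch.Tensor) -> torch.Tensor:
        # logits: [num_requests, vocab_size]; a single fused add covers the batch.
        return logits + self.bias[: logits.shape[0]]


proc = BannedTokenBias(max_batch=4, vocab_size=8)
proc.bias[0, 3] = float("-inf")  # ban token 3 for the request in slot 0
out = proc.apply(torch.zeros(2, 8))
assert out[0, 3] == float("-inf") and out[1, 3] == 0
```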
* Write efficient `apply()` and `update_state()` implementations in light of the fact that logits processors operate at batch granularity * For example, you may be able to use efficient vectorized operations to implement `apply()` or update internal state vectors in `update_state()` @@ -465,4 +465,4 @@ Once vLLM loads a logits processor during initialization, then vLLM will invoke * **Note:** for wrapped per-request logits processors, the `AdapterLogitsProcessor` base-class handles this by default -* `is_argmax_invariant()` can be hard-coded to `True` or `False` if the logits processor has consistent behavior. However the argmax invariance may also be determined programmatically (i.e. if your logits processor is user-customizable in some way that impacts whether the logits processor is argmax invariant). For this reason, `is_argmax_invariant()` is not a class method +* `is_argmax_invariant()` can be hard-coded to `True` or `False` if the logits processor has consistent behavior. However, the argmax invariance may also be determined programmatically (i.e. if your logits processor is user-customizable in some way that impacts whether the logits processor is argmax invariant). For this reason, `is_argmax_invariant()` is not a class method diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md index e8bfca0e5e88..be99cef3723e 100644 --- a/docs/getting_started/installation/cpu.md +++ b/docs/getting_started/installation/cpu.md @@ -104,7 +104,7 @@ Currently, there are no pre-built CPU wheels. ### Which `dtype` should be used? -- Currently vLLM CPU uses model default settings as `dtype`. However, due to unstable float16 support in torch CPU, it is recommended to explicitly set `dtype=bfloat16` if there are any performance or accuracy problem. +- Currently, vLLM CPU uses model default settings as `dtype`. However, due to unstable float16 support in torch CPU, it is recommended to explicitly set `dtype=bfloat16` if there are any performance or accuracy problem. ### How to launch a vLLM service on CPU? diff --git a/docs/getting_started/installation/cpu.s390x.inc.md b/docs/getting_started/installation/cpu.s390x.inc.md index 442c2b4ec64e..c2163139a7c5 100644 --- a/docs/getting_started/installation/cpu.s390x.inc.md +++ b/docs/getting_started/installation/cpu.s390x.inc.md @@ -2,7 +2,7 @@ vLLM has experimental support for s390x architecture on IBM Z platform. For now, users must build from source to natively run on IBM Z platform. -Currently the CPU implementation for s390x architecture supports FP32 datatype only. +Currently, the CPU implementation for s390x architecture supports FP32 datatype only. !!! warning There are no pre-built wheels or images for this device, so you must build vLLM from source. diff --git a/docs/getting_started/installation/cpu.x86.inc.md b/docs/getting_started/installation/cpu.x86.inc.md index 00f3b726b1a0..310f179cb89c 100644 --- a/docs/getting_started/installation/cpu.x86.inc.md +++ b/docs/getting_started/installation/cpu.x86.inc.md @@ -83,7 +83,7 @@ uv pip install dist/*.whl !!! example "Troubleshooting" - **NumPy ≥2.0 error**: Downgrade using `pip install "numpy<2.0"`. - **CMake picks up CUDA**: Add `CMAKE_DISABLE_FIND_PACKAGE_CUDA=ON` to prevent CUDA detection during CPU builds, even if CUDA is installed. - - `AMD` requies at least 4th gen processors (Zen 4/Genoa) or higher to support [AVX512](https://www.phoronix.com/review/amd-zen4-avx512) to run vLLM on CPU. 
+ - `AMD` requires at least 4th gen processors (Zen 4/Genoa) or higher to support [AVX512](https://www.phoronix.com/review/amd-zen4-avx512) to run vLLM on CPU. - If you receive an error such as: `Could not find a version that satisfies the requirement torch==X.Y.Z+cpu+cpu`, consider updating [pyproject.toml](https://github.com/vllm-project/vllm/blob/main/pyproject.toml) to help pip resolve the dependency. ```toml title="pyproject.toml" [build-system] From ac1daf32337d312e7a575901da2e19857f4c0be1 Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Mon, 17 Nov 2025 01:03:21 +0800 Subject: [PATCH 115/578] fix comment typo (#28802) Signed-off-by: Andy Xie --- vllm/envs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/envs.py b/vllm/envs.py index 7987e5fb83fd..6bf05803e14e 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -423,7 +423,7 @@ def get_vllm_port() -> int | None: raise ValueError(f"VLLM_PORT '{port}' must be a valid integer") from err -# The begin-* and end* here are used by the documentation generator +# The start-* and end* here are used by the documentation generator # to extract the used env vars. # --8<-- [start:env-vars-definition] From 5a87076d6ee60a2cf681dada9e971b4ee3e6063e Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Sun, 16 Nov 2025 17:37:15 +0000 Subject: [PATCH 116/578] [Model][QwenVL] Optimize `Qwen2_5_VisionAttention` q,k preparation (#28769) Signed-off-by: Lukas Geiger Co-authored-by: Isotr0py --- vllm/model_executor/models/dots_ocr.py | 4 +- vllm/model_executor/models/qwen2_5_vl.py | 48 ++++++++++++------------ 2 files changed, 25 insertions(+), 27 deletions(-) diff --git a/vllm/model_executor/models/dots_ocr.py b/vllm/model_executor/models/dots_ocr.py index 405af8f8be42..f46caaa095c6 100644 --- a/vllm/model_executor/models/dots_ocr.py +++ b/vllm/model_executor/models/dots_ocr.py @@ -39,8 +39,8 @@ ) from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM -from vllm.model_executor.models.qwen2_5_vl import Qwen2_5_VisionAttention from vllm.model_executor.models.qwen2_vl import ( + Qwen2VisionAttention, Qwen2VLDummyInputsBuilder, Qwen2VLMultiModalProcessor, Qwen2VLProcessingInfo, @@ -328,7 +328,7 @@ def forward( # [S, C] -> [S, B=1, C] x = hidden_states.unsqueeze(1) x, _ = self.qkv(x) - q, k, v = Qwen2_5_VisionAttention.split_qkv(self, x) + q, k, v = Qwen2VisionAttention.split_qkv(self, x) bs = q.shape[1] # [S,B,H,D] -> [B,S,H,D] q = q.permute(1, 0, 2, 3).contiguous() diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 7617929e93ac..897dd7ef29f1 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -359,23 +359,6 @@ def __init__( AttentionBackendEnum.ROCM_AITER_FA, } - def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]: - # [s, b, 3 * head * head_dim] - seq_len, bs, _ = qkv.shape - - # [s, b, 3 * head * head_dim] -> 3 * [s, b, head * head_dim] - q, k, v = qkv.chunk(3, dim=2) - - # 3 * [s, b, head * head_dim] -> 3 * [s, b, head, head_dim] - new_shape = ( - seq_len, - bs, - self.num_attention_heads_per_partition, - self.hidden_size_per_attention_head, - ) - q, k, v = (x.view(*new_shape) for x in (q, k, v)) - return q, k, v - def forward( self, x: torch.Tensor, @@ -386,17 +369,32 @@ def forward( ) -> torch.Tensor: # [s, b, c] --> [s, b, head * 3 * head_dim] x, _ = self.qkv(x) + seq_len, batch_size, _ = x.shape - # [s, b, 3 * head * head_dim] -> 3 * [s, b, head, 
head_dim] - q, k, v = self.split_qkv(x) - batch_size = q.shape[1] + qkv = einops.rearrange( + x, + "s b (three head head_dim) -> b s three head head_dim", + three=3, + head=self.num_attention_heads_per_partition, + ) - q, k, v = (einops.rearrange(x, "s b ... -> b s ...") for x in (q, k, v)) if rotary_pos_emb is not None: - # [2 * b, s, heads, head_dim] - qk_concat = torch.cat([q, k], dim=0) - qk_rotated = apply_rotary_pos_emb_vision(qk_concat, rotary_pos_emb) - q, k = torch.chunk(qk_rotated, 2, dim=0) + qk, v = qkv[:, :, :2], qkv[:, :, 2] + + qk_reshaped = einops.rearrange( + qk, "b s two head head_dim -> (two b) s head head_dim", two=2 + ) + qk_rotated = apply_rotary_pos_emb_vision(qk_reshaped, rotary_pos_emb) + qk_rotated = qk_rotated.view( + 2, + batch_size, + seq_len, + self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head, + ) + q, k = qk_rotated.unbind(dim=0) + else: + q, k, v = qkv.unbind(dim=2) if self.is_flash_attn_backend: context_layer = vit_flash_attn_wrapper( From 03ee48111de7372a1231872f26262e7c46ab1c83 Mon Sep 17 00:00:00 2001 From: amirkl94 <203507526+amirkl94@users.noreply.github.com> Date: Sun, 16 Nov 2025 20:39:44 +0200 Subject: [PATCH 117/578] Feature: Support Relu2 in FusedMoE fp8 cutlass path (#27261) --- tests/kernels/moe/test_flashinfer.py | 18 +++++++--- .../fused_moe/flashinfer_cutlass_moe.py | 11 +++++-- .../layers/quantization/modelopt.py | 33 +++++++++++-------- 3 files changed, 42 insertions(+), 20 deletions(-) diff --git a/tests/kernels/moe/test_flashinfer.py b/tests/kernels/moe/test_flashinfer.py index 3a681d4603f8..218df4a2632c 100644 --- a/tests/kernels/moe/test_flashinfer.py +++ b/tests/kernels/moe/test_flashinfer.py @@ -77,10 +77,14 @@ class TestData: @staticmethod def make_moe_tensors_8bit( - m: int, k: int, n: int, e: int, reorder: bool + m: int, k: int, n: int, e: int, reorder: bool, activation: str = "silu" ) -> "TestData": + is_gated = activation != "relu2_no_mul" + hidden_states = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10 - w13 = torch.randn((e, 2 * n, k), device="cuda", dtype=torch.bfloat16) + w13 = torch.randn( + (e, (2 * n) if is_gated else n, k), device="cuda", dtype=torch.bfloat16 + ) w2 = torch.randn((e, k, n), device="cuda", dtype=torch.bfloat16) # Scale to fp8 @@ -190,18 +194,22 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph( @pytest.mark.parametrize("m,n,k", MNK_FACTORS) @pytest.mark.parametrize("e", NUM_EXPERTS) @pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("activation", ["silu", "relu2_no_mul"]) def test_flashinfer_cutlass_moe_fp8_no_graph( m: int, n: int, k: int, e: int, topk: int, + activation: str, monkeypatch, ): current_platform.seed_everything(7) monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192") with set_current_vllm_config(vllm_config): - td = TestData.make_moe_tensors_8bit(m, k, n, e, reorder=False) + td = TestData.make_moe_tensors_8bit( + m, k, n, e, reorder=False, activation=activation + ) score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16) topk_weights, topk_ids, _ = FusedMoE.select_experts( @@ -233,7 +241,7 @@ def test_flashinfer_cutlass_moe_fp8_no_graph( topk_weights=topk_weights, topk_ids=topk_ids, inplace=False, - activation="silu", + activation=activation, global_num_experts=e, expert_map=None, apply_router_weight_on_input=True, @@ -253,7 +261,7 @@ def get_fused_moe_quant_config(n: torch.nn.Module) -> FusedMoEQuantConfig: td.layer, topk_weights, topk_ids, - activation="silu", + activation=activation, global_num_experts=e, 
expert_map=None, apply_router_weight_on_input=True, diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py index 943695f921ad..f864634c6617 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py @@ -148,8 +148,14 @@ def apply( expert_tokens_meta: mk.ExpertTokensMetadata | None, apply_router_weight_on_input: bool | None, ): - assert activation == "silu", ( - "Only activation silu is supported in FlashInferExperts" + from flashinfer.fused_moe.core import ActivationType + + activation_str_to_value_map = { + "silu": ActivationType.Swiglu, # This is the default + "relu2_no_mul": ActivationType.Relu2, + } + assert activation in activation_str_to_value_map, ( + f"{activation=} missing from {activation_str_to_value_map.keys()=}" ) # Select quantization metadata based on FP8 format/path @@ -215,6 +221,7 @@ def apply( ep_size=self.ep_size, ep_rank=self.ep_rank, output=output, + activation_type=activation_str_to_value_map[activation], # Informs FlashInfer to use the block-scale decoding path when True use_deepseek_fp8_block_scale=self.use_deepseek_fp8_block_scale, ) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index e14753c60c48..cf6325eb85df 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -354,12 +354,18 @@ def __init__( self.cutlass_fp8_supported = cutlass_fp8_supported() self.flashinfer_moe_backend: FlashinferMoeBackend | None = None - if ( - envs.VLLM_USE_FLASHINFER_MOE_FP8 - and has_flashinfer_moe() - and self.moe.is_act_and_mul - ): + if envs.VLLM_USE_FLASHINFER_MOE_FP8 and has_flashinfer_moe(): self.flashinfer_moe_backend = get_flashinfer_moe_backend() + if ( + self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM + and not self.moe.is_act_and_mul + ): + logger.info_once( + "Non-gated MoE is not supported for min-latency mode," + "falling back to high-throughput mode" + ) + self.flashinfer_moe_backend = FlashinferMoeBackend.CUTLASS + logger.info_once( f"Using FlashInfer {self.flashinfer_moe_backend.value} kernels" ) @@ -557,10 +563,11 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: ) if self.flashinfer_moe_backend is not None: - layer.w13_weight.data = swap_w13_to_w31(layer.w13_weight.data) - register_moe_scaling_factors(layer) + if self.moe.is_act_and_mul: + layer.w13_weight.data = swap_w13_to_w31(layer.w13_weight.data) if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM: rotate_flashinfer_fp8_moe_weights(layer.w13_weight, layer.w2_weight) + register_moe_scaling_factors(layer) def get_fused_moe_quant_config( self, layer: torch.nn.Module @@ -570,13 +577,13 @@ def get_fused_moe_quant_config( return fp8_w8a8_moe_quant_config( w1_scale=layer.w13_weight_scale, - g1_alphas=(layer.w13_weight_scale * layer.w13_input_scale).squeeze(), + g1_alphas=layer.output1_scales_gate_scalar.squeeze(), w2_scale=layer.w2_weight_scale, - g2_alphas=(layer.w2_weight_scale * layer.w2_input_scale).squeeze(), + g2_alphas=layer.output2_scales_scalar.squeeze(), a1_scale=layer.w13_input_scale, a1_gscale=layer.w13_input_scale, a2_scale=layer.w2_input_scale, - a2_gscale=1.0 / layer.w2_input_scale, + a2_gscale=layer.w2_input_scale_inv, per_act_token_quant=False, ) @@ -642,9 +649,9 @@ def apply( ) if self.flashinfer_moe_backend == 
FlashinferMoeBackend.CUTLASS: - assert not renormalize - assert activation == "silu", ( - f"Expected 'silu' activation but got {activation}" + assert activation in ("silu", "relu2_no_mul"), ( + "Expected activation to be in ('silu', 'relu2_no_mul')," + f"but got {activation}" ) return flashinfer_cutlass_moe_fp8( x, From 80b6080ddcad0653daa6b776eb71a5a7029b70d8 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Sun, 16 Nov 2025 14:46:46 -0800 Subject: [PATCH 118/578] [BugFix] Fix async scheduling + chunked prefill + preemption (#28787) Signed-off-by: Nick Hill --- tests/v1/e2e/test_async_scheduling.py | 10 ++++------ vllm/v1/core/sched/scheduler.py | 4 +--- vllm/v1/utils.py | 3 +++ 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/tests/v1/e2e/test_async_scheduling.py b/tests/v1/e2e/test_async_scheduling.py index dbe403ece051..c4aca82416cd 100644 --- a/tests/v1/e2e/test_async_scheduling.py +++ b/tests/v1/e2e/test_async_scheduling.py @@ -65,9 +65,8 @@ def test_without_spec_decoding( (True, "mp", True, None, False), (True, "uni", True, None, False), (False, "mp", True, None, True), - # Async scheduling + preemption + chunked prefill needs to be fixed (WIP) - # (True, "mp", True, None, True), - # (True, "uni", True, None, True), + (True, "mp", True, None, True), + (True, "uni", True, None, True), ] run_tests( @@ -103,9 +102,8 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch): (False, "mp", True, spec_config_short, True), (True, "uni", True, spec_config, False), (True, "uni", True, spec_config_short, False), - # Async scheduling + preemption + chunked prefill needs to be fixed (WIP) - # (True, "mp", True, spec_config, True), - # (True, "uni", True, spec_config_short, True), + (True, "mp", True, spec_config, True), + (True, "uni", True, spec_config_short, True), ] run_tests( diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index bc15979dea62..8e62542337a7 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -778,9 +778,7 @@ def _make_cached_request_data( assert not scheduled_in_prev_step resumed_req_ids.add(req_id) if not scheduled_in_prev_step: - all_token_ids[req_id] = req.all_token_ids[ - : req.num_computed_tokens + num_tokens - ] + all_token_ids[req_id] = req.all_token_ids.copy() new_block_ids.append( req_to_new_blocks[req_id].get_block_ids(allow_none=True) ) diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index a401f6d74cdd..29099d1e9b17 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -97,6 +97,9 @@ def __len__(self): def __repr__(self): return f"ConstantList({self._x})" + def copy(self) -> list[T]: + return self._x.copy() + class CpuGpuBuffer: """Buffer to easily copy tensors between CPU and GPU.""" From 561253b37faadaafe68168ea32d8d8157621a6b4 Mon Sep 17 00:00:00 2001 From: jiahanc <173873397+jiahanc@users.noreply.github.com> Date: Sun, 16 Nov 2025 18:02:42 -0800 Subject: [PATCH 119/578] [Performance][Fix] update nvfp4 code to support renorm routing (#28569) Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com> Co-authored-by: Michael Goin --- .../layers/quantization/modelopt.py | 18 +++++++++++------- .../quantization/utils/flashinfer_utils.py | 5 ++++- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index cf6325eb85df..476521813f46 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -15,6 
+15,7 @@ from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEQuantConfig, + RoutingMethodType, fp8_w8a8_moe_quant_config, nvfp4_moe_quant_config, ) @@ -1657,16 +1658,19 @@ def apply( use_llama4_routing = ( custom_routing_function is Llama4MoE.custom_routing_function ) - routing_method_type = flashinfer.RoutingMethodType.DeepSeekV3 + routing_method_type = layer.routing_method_type if use_llama4_routing: - routing_method_type = flashinfer.RoutingMethodType.Llama4 + routing_method_type = RoutingMethodType.Llama4 + router_logits = ( + router_logits.to(torch.float32) + if routing_method_type == RoutingMethodType.DeepSeekV3 + else router_logits + ) routing_bias = e_score_correction_bias if routing_bias is not None: routing_bias = routing_bias.to(torch.bfloat16) out = flashinfer.fused_moe.trtllm_fp4_block_scale_moe( - routing_logits=router_logits - if use_llama4_routing - else router_logits.to(torch.float32), + routing_logits=router_logits, routing_bias=routing_bias, hidden_states=hidden_states_fp4, hidden_states_scale=hidden_states_scale_linear_fp4.view( @@ -1690,8 +1694,8 @@ def apply( output2_scale_scalar=layer.g2_alphas.data, num_experts=global_num_experts, top_k=top_k, - n_group=num_expert_group if num_expert_group is not None else 0, - topk_group=topk_group if topk_group is not None else 0, + n_group=num_expert_group, + topk_group=topk_group, intermediate_size=layer.intermediate_size_per_partition, local_expert_offset=layer.ep_rank * layer.local_num_experts, local_num_experts=layer.local_num_experts, diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py index d9e9b4240271..f22e17945d1f 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py @@ -291,5 +291,8 @@ def get_flashinfer_moe_backend() -> FlashinferMoeBackend: def is_flashinfer_supporting_global_sf(backend: FlashinferMoeBackend | None) -> bool: # TODO(shuw@nvidia): Update when new backends are added. 
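For the activation plumbing in this commit, the reference-math sketch below contrasts the gated SwiGLU path (where `w13` stacks gate and up projections, hence `2 * n` rows) with the non-gated `relu2_no_mul` path (`n` rows); it is illustrative only, not the fused CUTLASS kernel.

```python
# Reference math only -- shows why the relu2_no_mul path needs an [n, k] w13
# while the gated silu path needs [2 * n, k]; the real path runs quantized
# fused kernels instead of this dense arithmetic.
import torch
import torch.nn.functional as F


def moe_ffn_reference(x, w13, w2, activation: str):
    h = x @ w13.t()                     # up (and gate, if gated) projection
    if activation == "silu":            # SwiGLU: split into gate / up halves
        gate, up = h.chunk(2, dim=-1)
        h = F.silu(gate) * up
    elif activation == "relu2_no_mul":  # squared ReLU, no gating multiply
        h = F.relu(h) ** 2
    else:
        raise ValueError(activation)
    return h @ w2.t()                   # down projection


x = torch.randn(2, 16)                  # k = 16, n = 8
out_gated = moe_ffn_reference(x, torch.randn(2 * 8, 16), torch.randn(16, 8), "silu")
out_relu2 = moe_ffn_reference(x, torch.randn(8, 16), torch.randn(16, 8), "relu2_no_mul")
assert out_gated.shape == out_relu2.shape == (2, 16)
```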
- backends_supporting_global_sf = (FlashinferMoeBackend.CUTLASS,) + backends_supporting_global_sf = ( + FlashinferMoeBackend.CUTLASS, + FlashinferMoeBackend.TENSORRT_LLM, + ) return backend in backends_supporting_global_sf From d64429bb369d4087f9f91609e7275c4901d65aea Mon Sep 17 00:00:00 2001 From: liuzhenwei Date: Mon, 17 Nov 2025 11:01:33 +0800 Subject: [PATCH 120/578] [NIXL][XPU] update install script of NIXL (#28778) Signed-off-by: zhenwei-intel --- docker/Dockerfile.xpu | 3 ++- tools/install_nixl_from_source_ubuntu.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu index 4e6ef8f5ca13..5d5b82c4fa5a 100644 --- a/docker/Dockerfile.xpu +++ b/docker/Dockerfile.xpu @@ -14,6 +14,7 @@ RUN apt clean && apt-get update -y && \ libxext6 \ libgl1 \ lsb-release \ + libaio-dev \ numactl \ wget \ vim \ @@ -68,8 +69,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \ RUN python3 -m pip install -e tests/vllm_test_utils # install nixl from source code +ENV NIXL_VERSION=0.7.0 RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py -ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages/.nixl.mesonpy.libs/plugins/" RUN --mount=type=cache,target=/root/.cache/pip \ pip uninstall oneccl oneccl-devel -y diff --git a/tools/install_nixl_from_source_ubuntu.py b/tools/install_nixl_from_source_ubuntu.py index 4a20b6b7bb8f..a786abba95ad 100644 --- a/tools/install_nixl_from_source_ubuntu.py +++ b/tools/install_nixl_from_source_ubuntu.py @@ -175,6 +175,7 @@ def build_and_install_prerequisites(args): build_env["LD_LIBRARY_PATH"] = ( f"{ucx_lib_path}:{ucx_plugin_path}:{existing_ld_path}".strip(":") ) + build_env["LDFLAGS"] = "-Wl,-rpath,$ORIGIN" print(f"--> Using LD_LIBRARY_PATH: {build_env['LD_LIBRARY_PATH']}", flush=True) temp_wheel_dir = os.path.join(ROOT_DIR, "temp_wheelhouse") From 60e089f0b90b1fe9b65224b069c953927d1f3b44 Mon Sep 17 00:00:00 2001 From: Xiake Sun Date: Mon, 17 Nov 2025 12:52:11 +0800 Subject: [PATCH 121/578] [ROCm][Qwen3-32B] Fix AITER MHA accuracy issue cause by #25763 (#28670) Signed-off-by: Xiake Sun --- vllm/v1/attention/backends/rocm_aiter_fa.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index ad454daa582e..ea611848b0e8 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -729,7 +729,7 @@ def forward( cu_seqlens_k=attn_metadata.prefill_metadata.query_start_loc, max_seqlen_q=attn_metadata.prefill_metadata.max_query_len, max_seqlen_k=attn_metadata.prefill_metadata.max_seq_len, - min_seqlen_q=attn_metadata.prefill_metadata.min_query_len, + min_seqlen_q=1, dropout_p=0.0, softmax_scale=self.scale, causal=True, @@ -759,7 +759,7 @@ def forward( cu_seqlens_q=attn_metadata.extend_metadata.query_start_loc, max_seqlen_q=attn_metadata.extend_metadata.max_query_len, max_seqlen_k=attn_metadata.extend_metadata.max_seq_len, - min_seqlen_q=attn_metadata.extend_metadata.min_query_len, + min_seqlen_q=1, block_table=attn_metadata.block_table[ num_decodes : num_decodes + num_extends ], From 6f374192442381b37a6a6ba29045c74a8ee2486d Mon Sep 17 00:00:00 2001 From: Jay Caldwell <111952840+jscaldwell55@users.noreply.github.com> Date: Sun, 16 Nov 2025 23:54:46 -0600 Subject: [PATCH 122/578] [Bugfix][Model] Prevent special token leakage in KimiK2ToolParser streaming mode (#28543) Signed-off-by: Jscaldwell55 --- 
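The tests added below exercise a small state machine around the tool-call section markers. As a stripped-down illustration of the suppression behaviour being verified (the marker strings are the real ones; the class is not the actual parser, which additionally buffers split markers and parses the tool-call payload):

```python
SECTION_BEGIN = "<|tool_calls_section_begin|>"
SECTION_END = "<|tool_calls_section_end|>"


class SectionFilter:
    """Toy filter: pass reasoning text through, suppress text inside a tool section."""

    def __init__(self) -> None:
        self.in_tool_section = False

    def visible_text(self, delta: str) -> str:
        out = []
        while delta:
            marker = SECTION_END if self.in_tool_section else SECTION_BEGIN
            before, sep, delta = delta.partition(marker)
            if not self.in_tool_section:
                out.append(before)  # reasoning text is surfaced to the client
            # text between the markers is suppressed, never leaked as content
            if sep:
                self.in_tool_section = not self.in_tool_section
        return "".join(out)


f = SectionFilter()
assert f.visible_text("hi " + SECTION_BEGIN + " noise " + SECTION_END + " bye") == "hi  bye"
assert not f.in_tool_section  # begin and end arriving in the same delta still round-trips
```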
tests/tool_use/test_kimi_k2_tool_parser.py | 593 ++++++++++++++++++ .../tool_parsers/kimi_k2_tool_parser.py | 202 +++++- 2 files changed, 790 insertions(+), 5 deletions(-) diff --git a/tests/tool_use/test_kimi_k2_tool_parser.py b/tests/tool_use/test_kimi_k2_tool_parser.py index c358589dbc29..33dabbc7e7b9 100644 --- a/tests/tool_use/test_kimi_k2_tool_parser.py +++ b/tests/tool_use/test_kimi_k2_tool_parser.py @@ -209,3 +209,596 @@ def test_streaming_no_tool_calls(kimi_k2_tool_parser): assert result is not None assert hasattr(result, "content") assert result.content == " without any tool calls." + + +def test_token_leak_between_section_and_tool_begin(kimi_k2_tool_parser): + """ + Test that text between <|tool_calls_section_begin|> and <|tool_call_begin|> + is suppressed and does not leak into reasoning_delta. + This is the main vulnerability being fixed. + """ + kimi_k2_tool_parser.reset_streaming_state() + + # Get token IDs for the markers + section_begin_token_id = kimi_k2_tool_parser.vocab.get( + "<|tool_calls_section_begin|>" + ) + tool_call_begin_token_id = kimi_k2_tool_parser.vocab.get("<|tool_call_begin|>") + + # Simulate streaming sequence: + # Delta 1: "I'll help you with that. " + result1 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="", + current_text="I'll help you with that. ", + delta_text="I'll help you with that. ", + previous_token_ids=[], + current_token_ids=[1, 2, 3], # Regular tokens + delta_token_ids=[1, 2, 3], + request=None, + ) + assert result1 is not None + assert result1.content == "I'll help you with that. " + + # Delta 2: "<|tool_calls_section_begin|>" + prev_ids = [1, 2, 3] + curr_ids = prev_ids + [section_begin_token_id] + result2 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="I'll help you with that. ", + current_text="I'll help you with that. <|tool_calls_section_begin|>", + delta_text="<|tool_calls_section_begin|>", + previous_token_ids=prev_ids, + current_token_ids=curr_ids, + delta_token_ids=[section_begin_token_id], + request=None, + ) + # Section marker should be stripped and suppressed + assert result2 is None or (result2.content is None or result2.content == "") + + # Delta 3: " spurious text or tokens " (THE LEAK SCENARIO) + prev_ids = curr_ids + curr_ids = curr_ids + [4, 5] + result3 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="I'll help you with that. <|tool_calls_section_begin|>", + current_text="I'll help you with that. <|tool_calls_section_begin|> spurious text ", + delta_text=" spurious text ", + previous_token_ids=prev_ids, + current_token_ids=curr_ids, + delta_token_ids=[4, 5], + request=None, + ) + # CRITICAL: This text should be suppressed, NOT returned as reasoning_delta + assert result3 is None or (result3.content is None or result3.content == "") + + # Delta 4: "<|tool_call_begin|>..." + prev_ids = curr_ids + curr_ids = curr_ids + [tool_call_begin_token_id] + _result4 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="I'll help you with that. <|tool_calls_section_begin|> spurious text ", + current_text="I'll help you with that. 
<|tool_calls_section_begin|> spurious text <|tool_call_begin|>", + delta_text="<|tool_call_begin|>", + previous_token_ids=prev_ids, + current_token_ids=curr_ids, + delta_token_ids=[tool_call_begin_token_id], + request=None, + ) + # Now we're in tool call mode, result depends on internal state + # The key is that the spurious text from Delta 3 was not leaked + + +def test_split_markers_across_deltas(kimi_k2_tool_parser): + """ + Test that markers split across delta chunks are correctly detected + via the rolling buffer mechanism. + """ + kimi_k2_tool_parser.reset_streaming_state() + + section_begin_token_id = kimi_k2_tool_parser.vocab.get( + "<|tool_calls_section_begin|>" + ) + + # Delta 1: "...reasoning<|tool_calls_sec" + _result1 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="Some reasoning", + current_text="Some reasoning<|tool_calls_sec", + delta_text="<|tool_calls_sec", + previous_token_ids=[1, 2], + current_token_ids=[1, 2, 3], # Partial token + delta_token_ids=[3], + request=None, + ) + # Partial token not recognized yet, might be buffered + # Should return as content or None (depends on implementation) + + # Delta 2: "tion_begin|> " (completes the marker) + _result2 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="Some reasoning<|tool_calls_sec", + current_text="Some reasoning<|tool_calls_section_begin|> ", + delta_text="tion_begin|> ", + previous_token_ids=[1, 2, 3], + current_token_ids=[1, 2, section_begin_token_id, 4], + delta_token_ids=[section_begin_token_id, 4], + request=None, + ) + # Now the complete marker should be detected via buffer + # The parser should enter tool section mode + assert kimi_k2_tool_parser.in_tool_section is True + + +def test_marker_variants(kimi_k2_tool_parser): + """Test that both singular and plural marker variants are recognized.""" + kimi_k2_tool_parser.reset_streaming_state() + + # Test singular variant: <|tool_call_section_begin|> (note: singular "call") + singular_token_id = kimi_k2_tool_parser.vocab.get("<|tool_call_section_begin|>") + + if singular_token_id is not None: # Only test if tokenizer supports it + _result = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="Reasoning ", + current_text="Reasoning <|tool_call_section_begin|>", + delta_text="<|tool_call_section_begin|>", + previous_token_ids=[1, 2], + current_token_ids=[1, 2, singular_token_id], + delta_token_ids=[singular_token_id], + request=None, + ) + # Should enter tool section mode with singular variant too + assert kimi_k2_tool_parser.in_tool_section is True + + +def test_reentry_to_reasoning_after_tool_section(kimi_k2_tool_parser): + """ + Test that after exiting a tool section with <|tool_calls_section_end|>, + subsequent text is correctly returned as reasoning content. 
+ """ + kimi_k2_tool_parser.reset_streaming_state() + + section_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_begin|>") + section_end_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_end|>") + + # Enter tool section + _result1 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="", + current_text="<|tool_calls_section_begin|>", + delta_text="<|tool_calls_section_begin|>", + previous_token_ids=[], + current_token_ids=[section_begin_id], + delta_token_ids=[section_begin_id], + request=None, + ) + assert kimi_k2_tool_parser.in_tool_section is True + + # Exit tool section + _result2 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="<|tool_calls_section_begin|>", + current_text="<|tool_calls_section_begin|><|tool_calls_section_end|>", + delta_text="<|tool_calls_section_end|>", + previous_token_ids=[section_begin_id], + current_token_ids=[section_begin_id, section_end_id], + delta_token_ids=[section_end_id], + request=None, + ) + assert kimi_k2_tool_parser.in_tool_section is False + + # Subsequent reasoning text should be returned normally + result3 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="<|tool_calls_section_begin|><|tool_calls_section_end|>", + current_text="<|tool_calls_section_begin|><|tool_calls_section_end|> More reasoning", + delta_text=" More reasoning", + previous_token_ids=[section_begin_id, section_end_id], + current_token_ids=[section_begin_id, section_end_id, 10, 11], + delta_token_ids=[10, 11], + request=None, + ) + assert result3 is not None + assert result3.content == " More reasoning" + + +def test_empty_tool_section(kimi_k2_tool_parser): + """Test an empty tool section (begin immediately followed by end).""" + kimi_k2_tool_parser.reset_streaming_state() + + section_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_begin|>") + section_end_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_end|>") + + # Section begin + _result1 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="Reasoning ", + current_text="Reasoning <|tool_calls_section_begin|>", + delta_text="<|tool_calls_section_begin|>", + previous_token_ids=[1], + current_token_ids=[1, section_begin_id], + delta_token_ids=[section_begin_id], + request=None, + ) + + # Immediate section end + _result2 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="Reasoning <|tool_calls_section_begin|>", + current_text="Reasoning <|tool_calls_section_begin|><|tool_calls_section_end|>", + delta_text="<|tool_calls_section_end|>", + previous_token_ids=[1, section_begin_id], + current_token_ids=[1, section_begin_id, section_end_id], + delta_token_ids=[section_end_id], + request=None, + ) + # Should exit cleanly without errors + assert kimi_k2_tool_parser.in_tool_section is False + + +def test_malformed_tool_section_recovery(kimi_k2_tool_parser): + """ + Test that the parser recovers from a malformed tool section + that never closes properly. 
+ """ + kimi_k2_tool_parser.reset_streaming_state() + + section_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_begin|>") + + # Enter tool section + _result1 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="", + current_text="<|tool_calls_section_begin|>", + delta_text="<|tool_calls_section_begin|>", + previous_token_ids=[], + current_token_ids=[section_begin_id], + delta_token_ids=[section_begin_id], + request=None, + ) + assert kimi_k2_tool_parser.in_tool_section is True + + # Simulate a lot of text without proper tool calls or section end + # This should trigger the error recovery mechanism + large_text = "x" * 10000 # Exceeds max_section_chars + + result2 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="<|tool_calls_section_begin|>", + current_text="<|tool_calls_section_begin|>" + large_text, + delta_text=large_text, + previous_token_ids=[section_begin_id], + current_token_ids=[section_begin_id] + list(range(100, 100 + len(large_text))), + delta_token_ids=list(range(100, 100 + len(large_text))), + request=None, + ) + + # Parser should have force-exited the tool section + assert kimi_k2_tool_parser.in_tool_section is False + # And returned the content as reasoning + assert result2 is not None + assert result2.content == large_text + + +def test_state_reset(kimi_k2_tool_parser): + """Test that reset_streaming_state() properly clears all state.""" + # Put parser in a complex state + kimi_k2_tool_parser.in_tool_section = True + kimi_k2_tool_parser.token_buffer = "some buffer" + kimi_k2_tool_parser.current_tool_id = 5 + kimi_k2_tool_parser.prev_tool_call_arr = [{"id": "test"}] + kimi_k2_tool_parser.section_char_count = 1000 + + # Reset + kimi_k2_tool_parser.reset_streaming_state() + + # Verify all state is cleared + assert kimi_k2_tool_parser.in_tool_section is False + assert kimi_k2_tool_parser.token_buffer == "" + assert kimi_k2_tool_parser.current_tool_id == -1 + assert kimi_k2_tool_parser.prev_tool_call_arr == [] + assert kimi_k2_tool_parser.section_char_count == 0 + assert kimi_k2_tool_parser.current_tool_name_sent is False + assert kimi_k2_tool_parser.streamed_args_for_tool == [] + + +def test_section_begin_noise_tool_begin_same_chunk(kimi_k2_tool_parser): + """ + Test that begin→noise→tool_begin within the SAME chunk suppresses + the noise text correctly (not just across chunks). 
+ """ + kimi_k2_tool_parser.reset_streaming_state() + + section_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_begin|>") + tool_call_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_call_begin|>") + + # Single delta containing: section_begin + spurious text + tool_call_begin + combined_text = "<|tool_calls_section_begin|> noise text <|tool_call_begin|>" + + result = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="Reasoning ", + current_text="Reasoning " + combined_text, + delta_text=combined_text, + previous_token_ids=[1, 2], + current_token_ids=[1, 2, section_begin_id, 3, 4, tool_call_begin_id], + delta_token_ids=[section_begin_id, 3, 4, tool_call_begin_id], + request=None, + ) + + # The noise text should NOT leak into content + # Result should either be None/empty or start tool call parsing + if result is not None and result.content is not None: + # If content is returned, it should not contain the noise + assert "noise text" not in result.content + assert result.content == "" or result.content.strip() == "" + + +def test_stream_ends_without_section_end_marker(kimi_k2_tool_parser): + """ + Test that if the stream ends (EOF) without a proper section end marker, + the parser doesn't leak text, doesn't crash, and resets state cleanly. + """ + kimi_k2_tool_parser.reset_streaming_state() + + section_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_begin|>") + + # Enter tool section + _result1 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="", + current_text="<|tool_calls_section_begin|>", + delta_text="<|tool_calls_section_begin|>", + previous_token_ids=[], + current_token_ids=[section_begin_id], + delta_token_ids=[section_begin_id], + request=None, + ) + assert kimi_k2_tool_parser.in_tool_section is True + + # Some content in tool section + result2 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="<|tool_calls_section_begin|>", + current_text="<|tool_calls_section_begin|> partial content", + delta_text=" partial content", + previous_token_ids=[section_begin_id], + current_token_ids=[section_begin_id, 10, 11], + delta_token_ids=[10, 11], + request=None, + ) + # Content should be suppressed + assert result2.content == "" or result2.content is None + + # Stream ends (EOF) - no more deltas, no section_end marker + # Simulate this by manually checking state and resetting + # (In real usage, the request handler would call reset_streaming_state) + assert kimi_k2_tool_parser.in_tool_section is True # Still in section + + # Reset state (as would happen between requests) + kimi_k2_tool_parser.reset_streaming_state() + + # Verify clean slate + assert kimi_k2_tool_parser.in_tool_section is False + assert kimi_k2_tool_parser.token_buffer == "" + + # Next request should work normally + result3 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="", + current_text="New reasoning", + delta_text="New reasoning", + previous_token_ids=[], + current_token_ids=[20, 21], + delta_token_ids=[20, 21], + request=None, + ) + assert result3 is not None + assert result3.content == "New reasoning" + + +def test_same_chunk_begin_and_end_markers(kimi_k2_tool_parser): + """ + CRITICAL TEST: Verify that when both section_begin and section_end + markers appear in the SAME chunk, the parser correctly: + 1. Enters the tool section + 2. Immediately exits the tool section + 3. 
Does NOT get stuck in in_tool_section=True state + + This tests the bug fix where elif was changed to if to handle + both state transitions in a single delta. + """ + kimi_k2_tool_parser.reset_streaming_state() + + section_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_begin|>") + section_end_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_end|>") + + # Single chunk with both markers (e.g., empty tool section) + combined_delta = "<|tool_calls_section_begin|><|tool_calls_section_end|>" + + result = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="Some reasoning ", + current_text="Some reasoning " + combined_delta, + delta_text=combined_delta, + previous_token_ids=[1, 2], + current_token_ids=[1, 2, section_begin_id, section_end_id], + delta_token_ids=[section_begin_id, section_end_id], + request=None, + ) + + # CRITICAL: Parser should NOT be stuck in tool section + assert kimi_k2_tool_parser.in_tool_section is False, ( + "Parser stuck in tool section after processing both begin/end in same chunk. " + "This indicates the elif bug was not fixed." + ) + + # Result should be empty or contain only stripped content + assert result is not None + assert result.content == "" or result.content is None + + # Verify subsequent content streams correctly (not suppressed) + result2 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="Some reasoning " + combined_delta, + current_text="Some reasoning " + combined_delta + " More reasoning", + delta_text=" More reasoning", + previous_token_ids=[1, 2, section_begin_id, section_end_id], + current_token_ids=[1, 2, section_begin_id, section_end_id, 10, 11], + delta_token_ids=[10, 11], + request=None, + ) + + # This content should NOT be suppressed (we're out of tool section) + assert result2 is not None + assert result2.content == " More reasoning" + + +def test_same_chunk_begin_content_end_markers(kimi_k2_tool_parser): + """ + Test the same-chunk scenario with actual content between markers. + Example: <|tool_calls_section_begin|> text <|tool_calls_section_end|> + all arriving in one delta. The key is that the state machine correctly + transitions in and out within the same chunk. 
+ """ + kimi_k2_tool_parser.reset_streaming_state() + + section_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_begin|>") + section_end_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_end|>") + + # Chunk with begin, some whitespace/noise, and end all together + # This simulates a tool section that opens and closes in the same chunk + combined_delta = "<|tool_calls_section_begin|> <|tool_calls_section_end|>" + + _result = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="Reasoning ", + current_text="Reasoning " + combined_delta, + delta_text=combined_delta, + previous_token_ids=[1], + current_token_ids=[1, section_begin_id, 100, section_end_id], + delta_token_ids=[section_begin_id, 100, section_end_id], + request=None, + ) + + # Parser should exit cleanly (not stuck in tool section) + assert kimi_k2_tool_parser.in_tool_section is False + + # Verify the fix: next content should stream normally, not be suppressed + result2 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="Reasoning " + combined_delta, + current_text="Reasoning " + combined_delta + " Done", + delta_text=" Done", + previous_token_ids=[1, section_begin_id, 100, section_end_id], + current_token_ids=[1, section_begin_id, 100, section_end_id, 200], + delta_token_ids=[200], + request=None, + ) + + # Content after section should be returned (not suppressed) + assert result2 is not None + assert result2.content == " Done" + + +def test_tool_call_end_and_section_end_same_chunk(kimi_k2_tool_parser): + """ + CRITICAL TEST (P1): Verify that when both <|tool_call_end|> and + <|tool_calls_section_end|> appear in the SAME chunk, the parser: + 1. Processes the tool_call_end first (emits final arguments) + 2. THEN exits the section + 3. Does NOT drop the final tool call update + 4. Does NOT leak special tokens into reasoning + + This tests the deferred section exit fix. + """ + kimi_k2_tool_parser.reset_streaming_state() + + section_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_begin|>") + section_end_id = kimi_k2_tool_parser.vocab.get("<|tool_calls_section_end|>") + tool_begin_id = kimi_k2_tool_parser.vocab.get("<|tool_call_begin|>") + tool_end_id = kimi_k2_tool_parser.vocab.get("<|tool_call_end|>") + + # Simulate a streaming sequence for a SHORT tool call (all in one chunk): + # 1. Reasoning text + result1 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="", + current_text="Let me help. ", + delta_text="Let me help. ", + previous_token_ids=[], + current_token_ids=[1, 2], + delta_token_ids=[1, 2], + request=None, + ) + assert result1 is not None + assert result1.content == "Let me help. " + + # 2. Section begin + _result2 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text="Let me help. ", + current_text="Let me help. <|tool_calls_section_begin|>", + delta_text="<|tool_calls_section_begin|>", + previous_token_ids=[1, 2], + current_token_ids=[1, 2, section_begin_id], + delta_token_ids=[section_begin_id], + request=None, + ) + assert kimi_k2_tool_parser.in_tool_section is True + + # 3. Tool call begin + full content + tool_end + section_end ALL IN ONE CHUNK + # This is the critical scenario for short tool calls + combined = ( + '<|tool_call_begin|>get_weather:0 <|tool_call_argument_begin|> {"city": "Paris"} ' + "<|tool_call_end|><|tool_calls_section_end|>" + ) + + # Build up the previous text gradually to simulate realistic streaming + prev_text = "Let me help. 
<|tool_calls_section_begin|>" + curr_text = prev_text + combined + + result3 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text=prev_text, + current_text=curr_text, + delta_text=combined, + previous_token_ids=[1, 2, section_begin_id], + current_token_ids=[ + 1, + 2, + section_begin_id, + tool_begin_id, + 10, + 11, + 12, + tool_end_id, + section_end_id, + ], + delta_token_ids=[tool_begin_id, 10, 11, 12, tool_end_id, section_end_id], + request=None, + ) + + # CRITICAL: Parser should have exited section AFTER processing tool + assert kimi_k2_tool_parser.in_tool_section is False + + # Tool call should have been emitted (not dropped) + # The result might be the tool name or None depending on state, but + # importantly, it shouldn't be returning the literal tokens as content + + if result3 is not None and result3.content is not None: + # Verify no special tokens leaked into content + assert "<|tool_call_end|>" not in result3.content + assert "<|tool_calls_section_end|>" not in result3.content + + # 4. Verify subsequent content streams normally + result4 = kimi_k2_tool_parser.extract_tool_calls_streaming( + previous_text=curr_text, + current_text=curr_text + " Done", + delta_text=" Done", + previous_token_ids=[ + 1, + 2, + section_begin_id, + tool_begin_id, + 10, + 11, + 12, + tool_end_id, + section_end_id, + ], + current_token_ids=[ + 1, + 2, + section_begin_id, + tool_begin_id, + 10, + 11, + 12, + tool_end_id, + section_end_id, + 20, + ], + delta_token_ids=[20], + request=None, + ) + + # Content after tool section should stream normally + assert result4 is not None + assert result4.content == " Done" diff --git a/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py index 0453db58361a..a84c9e454716 100644 --- a/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py @@ -34,8 +34,27 @@ def __init__(self, tokenizer: AnyTokenizer): str ] = [] # map what has been streamed for each tool so far to a list + # Section-level state management to prevent token leakage + self.in_tool_section: bool = False + self.token_buffer: str = "" + # Buffer size: empirical worst-case for longest marker (~30 chars) * 2 + # + safety margin for unicode + partial overlap. Prevents unbounded growth. 
+ self.buffer_max_size: int = 1024 + self.section_char_count: int = 0 # Track characters processed in tool section + self.max_section_chars: int = 8192 # Force exit if section exceeds this + self._buffer_overflow_logged: bool = False # Log overflow once per session + + # Support both singular and plural variants self.tool_calls_start_token: str = "<|tool_calls_section_begin|>" self.tool_calls_end_token: str = "<|tool_calls_section_end|>" + self.tool_calls_start_token_variants: list[str] = [ + "<|tool_calls_section_begin|>", + "<|tool_call_section_begin|>", # singular variant + ] + self.tool_calls_end_token_variants: list[str] = [ + "<|tool_calls_section_end|>", + "<|tool_call_section_end|>", # singular variant + ] self.tool_call_start_token: str = "<|tool_call_begin|>" self.tool_call_end_token: str = "<|tool_call_end|>" @@ -58,6 +77,18 @@ def __init__(self, tokenizer: AnyTokenizer): self.tool_calls_start_token_id = self.vocab.get(self.tool_calls_start_token) self.tool_calls_end_token_id = self.vocab.get(self.tool_calls_end_token) + # Get token IDs for all variants + self.tool_calls_start_token_ids: list[int] = [ + tid + for variant in self.tool_calls_start_token_variants + if (tid := self.vocab.get(variant)) is not None + ] + self.tool_calls_end_token_ids: list[int] = [ + tid + for variant in self.tool_calls_end_token_variants + if (tid := self.vocab.get(variant)) is not None + ] + self.tool_call_start_token_id = self.vocab.get(self.tool_call_start_token) self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token) @@ -70,6 +101,51 @@ def __init__(self, tokenizer: AnyTokenizer): "tokens in the tokenizer!" ) + def _check_and_strip_markers(self, text: str) -> tuple[str, bool, bool]: + """ + Check for section begin/end markers in text and strip them. + Returns: (cleaned_text, found_section_begin, found_section_end) + """ + found_begin = False + found_end = False + cleaned = text + + # Check for section begin markers (any variant) + for variant in self.tool_calls_start_token_variants: + if variant in cleaned: + cleaned = cleaned.replace(variant, "") + found_begin = True + + # Check for section end markers (any variant) + for variant in self.tool_calls_end_token_variants: + if variant in cleaned: + cleaned = cleaned.replace(variant, "") + found_end = True + + return cleaned, found_begin, found_end + + def _reset_section_state(self) -> None: + """Reset state when exiting tool section.""" + self.in_tool_section = False + self.token_buffer = "" + self.section_char_count = 0 + + def reset_streaming_state(self) -> None: + """ + Reset all streaming state. Call this between requests to prevent + state leakage when parser instance is reused. 
+ """ + # Reset section state + self._reset_section_state() + + # Reset parent class state + self.current_tool_name_sent = False + self.prev_tool_call_arr = [] + self.current_tool_id = -1 + self.streamed_args_for_tool = [] + + logger.debug("Streaming state reset") + def extract_tool_calls( self, model_output: str, @@ -131,13 +207,94 @@ def extract_tool_calls_streaming( ) -> DeltaMessage | None: logger.debug("delta_text: %s", delta_text) logger.debug("delta_token_ids: %s", delta_token_ids) - # check to see if we should be streaming a tool call - is there a - if self.tool_calls_start_token_id not in current_token_ids: + + # Flag to defer section exit until after tool parsing completes + deferred_section_exit = False + + # Add delta to buffer for split marker detection + self.token_buffer += delta_text + + # Enforce buffer size limit to prevent memory issues + if len(self.token_buffer) > self.buffer_max_size: + if not self._buffer_overflow_logged: + logger.warning( + "Token buffer exceeded max size (%d bytes), flushing excess. " + "This may indicate very long markers or unusual tokenization.", + self.buffer_max_size, + ) + self._buffer_overflow_logged = True + # Keep only the most recent content that might contain partial markers + self.token_buffer = self.token_buffer[-self.buffer_max_size // 2 :] + + # Check buffer for section markers (handles split tokens) + buffered_text, found_section_begin, found_section_end = ( + self._check_and_strip_markers(self.token_buffer) + ) + + # Track section state transitions + if found_section_begin and not self.in_tool_section: + logger.debug("Entering tool section") + self.in_tool_section = True + self.token_buffer = buffered_text # Use cleaned buffer + self.section_char_count = 0 # Reset counter for new section + if found_section_end and self.in_tool_section: + logger.debug("Detected section end marker") + # CRITICAL: Don't exit early if tool_call_end is in this chunk. + # Tool parser must emit final arguments/close first to avoid dropping + # the final tool update and leaking tokens into reasoning channel. 
+ has_tool_end = self.tool_call_end_token_id in delta_token_ids + if has_tool_end: + # Defer exit until after tool parsing completes + deferred_section_exit = True + logger.debug("Deferring section exit: tool_call_end in same chunk") + self.token_buffer = buffered_text + else: + # No tool call ending, safe to exit immediately + logger.debug("Exiting tool section") + remaining = buffered_text + self._reset_section_state() + # Return remaining text as reasoning content if non-empty + if remaining.strip(): + return DeltaMessage(content=remaining) + # Return empty delta to maintain function contract + # (always returns DeltaMessage) + return DeltaMessage(content="") + else: + self.token_buffer = buffered_text + + # Check if any variant of section start token is in current_token_ids + has_section_token = any( + tid in current_token_ids for tid in self.tool_calls_start_token_ids + ) + + # Early return: if no section token detected yet, return as reasoning content + if not has_section_token and not self.in_tool_section: logger.debug("No tool call tokens found!") + # Don't clear buffer - it needs to accumulate partial markers across deltas + # Buffer overflow is already protected by lines 215-224 return DeltaMessage(content=delta_text) - delta_text = delta_text.replace(self.tool_calls_start_token, "").replace( - self.tool_calls_end_token, "" - ) + + # Strip section markers from delta_text for subsequent processing + # NOTE: This preprocessing happens BEFORE the regex-based tool call + # parsing (from PR #24847) to ensure markers are removed cleanly + # before pattern matching. No double-stripping occurs because + # section markers and tool call markers are distinct. + delta_text, _, _ = self._check_and_strip_markers(delta_text) + + # Error recovery: If in tool section for too long, force exit + if self.in_tool_section: + self.section_char_count += len(delta_text) + if self.section_char_count > self.max_section_chars: + logger.warning( + "Tool section exceeded max length (%d chars), forcing exit. " + "This may indicate malformed model output.", + self.max_section_chars, + ) + self._reset_section_state() + # Deferred exit already handled by forced exit above + # Return remaining content as reasoning (or empty delta if no content) + return DeltaMessage(content=delta_text if delta_text.strip() else "") + try: # figure out where we are in the parsing by counting tool call # start & end tags @@ -158,6 +315,16 @@ def extract_tool_calls_streaming( and prev_tool_end_count == cur_tool_end_count and self.tool_call_end_token not in delta_text ): + # CRITICAL FIX: Suppress content if in tool section but + # no tool calls started + if self.in_tool_section and cur_tool_start_count == 0: + logger.debug( + "In tool section but no tool calls started yet. " + "Suppressing: %s", + delta_text, + ) + # Return empty delta to maintain iterator contract + return DeltaMessage(content="") logger.debug("Generating text content! 
skipping tool parsing.") return DeltaMessage(content=delta_text) @@ -209,6 +376,9 @@ def extract_tool_calls_streaming( ): if self.prev_tool_call_arr is None or len(self.prev_tool_call_arr) == 0: logger.debug("attempting to close tool call, but no tool call") + # Handle deferred section exit before returning + if deferred_section_exit and self.in_tool_section: + self._reset_section_state() return None diff = self.prev_tool_call_arr[self.current_tool_id].get("arguments") if diff: @@ -218,6 +388,9 @@ def extract_tool_calls_streaming( else diff ) if '"}' not in delta_text: + # Handle deferred section exit before returning + if deferred_section_exit and self.in_tool_section: + self._reset_section_state() return None end_loc = delta_text.rindex('"}') diff = delta_text[:end_loc] + '"}' @@ -227,6 +400,10 @@ def extract_tool_calls_streaming( diff, ) self.streamed_args_for_tool[self.current_tool_id] += diff + # Handle deferred section exit before returning + if deferred_section_exit and self.in_tool_section: + logger.debug("Completing deferred section exit") + self._reset_section_state() return DeltaMessage( tool_calls=[ DeltaToolCall( @@ -240,9 +417,19 @@ def extract_tool_calls_streaming( # case -- otherwise we're just generating text else: + # Check if we're in tool section - if so, suppress + if self.in_tool_section: + logger.debug("In tool section, suppressing text generation") + # Handle deferred section exit before returning + if deferred_section_exit: + self._reset_section_state() + return DeltaMessage(content="") text = delta_text.replace(self.tool_call_start_token, "") text = text.replace(self.tool_call_end_token, "") delta = DeltaMessage(tool_calls=[], content=text) + # Handle deferred section exit before returning + if deferred_section_exit and self.in_tool_section: + self._reset_section_state() return delta current_tool_call = dict() @@ -390,6 +577,11 @@ def extract_tool_calls_streaming( else: self.prev_tool_call_arr.append(current_tool_call) + # Handle deferred section exit after tool parsing completes + if deferred_section_exit and self.in_tool_section: + logger.debug("Completing deferred section exit") + self._reset_section_state() + return delta except Exception: From 3380ed5e115613bb0029164754ffea99f328e065 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Mon, 17 Nov 2025 14:08:48 +0800 Subject: [PATCH 123/578] [Doc] Add llama4 LoRA tag (#28825) Signed-off-by: Jee Jee Li --- docs/models/supported_models.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 6eb0947fe568..d47aeaab511b 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -684,7 +684,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `KeyeVL1_5ForConditionalGeneration` | Keye-VL-1_5-8B | T + IE+ + VE+ | `Kwai-Keye/Keye-VL-1_5-8B` | ✅︎ | ✅︎ | | `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I+ | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | ✅︎ | | `LightOnOCRForConditionalGeneration` | LightOnOCR-1B | T + I+ | `lightonai/LightOnOCR-1B`, etc | ✅︎ | ✅︎ | -| `Llama4ForConditionalGeneration` | Llama 4 | T + I+ | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. 
| | ✅︎ | +| `Llama4ForConditionalGeneration` | Llama 4 | T + I+ | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | ✅︎ | ✅︎ | | `Llama_Nemotron_Nano_VL` | Llama Nemotron Nano VL | T + IE+ | `nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1` | ✅︎ | ✅︎ | | `LlavaForConditionalGeneration` | LLaVA-1.5, Pixtral (HF Transformers) | T + IE+ | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), `mistral-community/pixtral-12b`, etc. | | ✅︎ | | `LlavaNextForConditionalGeneration` | LLaVA-NeXT | T + IE+ | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. | | ✅︎ | From 679a7cffdc3baf2a2f205d993a60a8925ebfd358 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Mon, 17 Nov 2025 06:29:29 +0000 Subject: [PATCH 124/578] WIP: Integrate Aiter bpreshuffle and ck kernels Signed-off-by: vllmellm --- vllm/_aiter_ops.py | 56 ++++++ .../kernels/scaled_mm/__init__.py | 4 + .../quantization/kernels/scaled_mm/aiter.py | 165 ++++++++++++++++++ 3 files changed, 225 insertions(+) diff --git a/vllm/_aiter_ops.py b/vllm/_aiter_ops.py index 5508e59bcd2f..6de21176e948 100644 --- a/vllm/_aiter_ops.py +++ b/vllm/_aiter_ops.py @@ -402,6 +402,42 @@ def _rocm_aiter_rmsnorm2d_fwd_with_add_fake( return torch.empty_like(x), torch.empty_like(residual) +def _rocm_aiter_gemm_a8w8_bpreshuffle_impl( + input: torch.Tensor, + weight: torch.Tensor, + out_dtype: torch.dtype | None = None, + scale_a: torch.Tensor | None = None, + scale_b: torch.Tensor | None = None, +) -> torch.Tensor: + # This AITER function can be used for + # - per-token activations + per-channel weights + # accept the weight as # keep the weight as (N, K) + # NOTE: The weight has to be shuffled in the + # process_weights_after_loading of the CompressedTensorsW8A8Fp8 class + + from aiter import gemm_a8w8_bpreshuffle_ck + + m = input.shape[0] + n = weight.shape[0] + Y = torch.empty(m, n, dtype=out_dtype, device=input.device) + gemm_a8w8_bpreshuffle_ck(input, weight, scale_a, scale_b, Y) + return Y + + +def _rocm_aiter_gemm_a8w8_bpreshuffle_fake( + input: torch.Tensor, + weight: torch.Tensor, + out_dtype: torch.dtype | None = None, + scale_a: torch.Tensor | None = None, + scale_b: torch.Tensor | None = None, +) -> torch.Tensor: + m = input.shape[0] + n = weight.shape[0] + if out_dtype is None: + out_dtype = input.dtype + return torch.empty((m, n), dtype=out_dtype, device=input.device) + + # Global flag to ensure ops are registered only once _OPS_REGISTERED = False @@ -592,6 +628,14 @@ def register_ops_once() -> None: dispatch_key=current_platform.dispatch_key, ) + direct_register_custom_op( + op_name="rocm_aiter_gemm_a8w8_bpreshuffle", + op_func=_rocm_aiter_gemm_a8w8_bpreshuffle_impl, + mutates_args=[], + fake_impl=_rocm_aiter_gemm_a8w8_bpreshuffle_fake, + dispatch_key=current_platform.dispatch_key, + ) + _OPS_REGISTERED = True @staticmethod @@ -635,6 +679,18 @@ def gemm_a8w8_blockscale( A, B, As, Bs, output_dtype ) + @staticmethod + def gemm_a8w8_bpreshuffle( + input: torch.Tensor, + weight: torch.Tensor, + out_dtype: torch.dtype | None = None, + scale_a: torch.Tensor | None = None, + scale_b: torch.Tensor | None = None, + ) -> torch.Tensor: + return torch.ops.vllm.rocm_aiter_gemm_a8w8_bpreshuffle( + input, weight, out_dtype, scale_a, scale_b + ) + @staticmethod def fused_moe( hidden_states: torch.Tensor, diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py 
b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py index 36e4a16c0168..90cbda90adf9 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py @@ -8,6 +8,8 @@ from vllm.logger import init_logger from vllm.model_executor.layers.quantization.kernels.scaled_mm.aiter import ( + AiterBpreshufflePerTokenFp8ScaledMMLinearKernel, + AiterCKPerTokenFp8ScaledMMLinearKernel, AiterScaledMMLinearKernel, ) from vllm.model_executor.layers.quantization.kernels.scaled_mm.cpu import ( @@ -64,6 +66,8 @@ ChannelWiseTorchScaledMMLinearKernel, ], PlatformEnum.ROCM: [ + AiterBpreshufflePerTokenFp8ScaledMMLinearKernel, + AiterCKPerTokenFp8ScaledMMLinearKernel, ROCmScaledMMLinearKernel, PerTensorTorchScaledMMLinearKernel, RowWiseTorchScaledMMLinearKernel, diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py index 4a1c76ffd9b1..28c5640d319a 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py @@ -2,17 +2,25 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Callable + import torch +from aiter.ops.shuffle import shuffle_weight from vllm import _custom_ops as ops from vllm._aiter_ops import rocm_aiter_ops +from vllm.logger import init_logger from vllm.platforms import current_platform from .cutlass import CutlassScaledMMLinearKernel from .ScaledMMLinearKernel import ( + FP8ScaledMMLinearKernel, + FP8ScaledMMLinearLayerConfig, Int8ScaledMMLinearLayerConfig, ) +logger = init_logger(__name__) + class AiterScaledMMLinearKernel(CutlassScaledMMLinearKernel): @classmethod @@ -117,3 +125,160 @@ def apply_weights( # b to be [N, K] # CutlassScaledMMLinearKernel prepare weight `w_q` in [K, N] format return rocm_aiter_ops.gemm_a8w8(x_q, w_q.t(), x_s, w_s, bias, out_dtype) + + +class AiterBpreshufflePerTokenFp8ScaledMMLinearKernel(FP8ScaledMMLinearKernel): + def get_ouput_padding(self) -> int | None: + # PTPC kernels do not require padding. + return None + + @classmethod + def can_implement(cls, c: FP8ScaledMMLinearLayerConfig) -> tuple[bool, str | None]: + if not current_platform.is_rocm(): + return (False, "AITER bpreshuffle is ROCm-only") + + if not rocm_aiter_ops.is_linear_enabled(): + return (False, "AITER bpreshuffle is disabled by env var") + + try: + import aiter # noqa: F401 + except Exception: + return (False, "AITER not installed") + + # Check if the configuration is PTPC + is_per_channel_weight = c.weight_quant_key.scale.group_shape.is_per_token() + is_per_token_activation = ( + c.activation_quant_key.scale.group_shape.is_per_token() + ) + is_ptpc = is_per_channel_weight and is_per_token_activation + + logger.info_once(f"AiterBpreshuffle: can_implement called. 
is_ptpc={is_ptpc}") + + if not is_ptpc: + return (False, "This kernel only handles Per-Token/Per-Channel (PTPC)") + + return True, None + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + logger.info_once("AiterBpreshuffle: SHUFFLING WEIGHTS NOW.") + + w_q, _, _, _ = self._get_layer_params(layer) + + N = w_q.shape[1] + K = w_q.shape[0] + + if N % 16 == 0 and K % 16 == 0: + # AITER shuffle_weight expectation [N, K] + w_q_nk = w_q.t().contiguous() + + # Execute shuffle + shuffled_w_nk = shuffle_weight(w_q_nk, layout=(16, 16)) + + del layer.weight + layer.register_buffer("weight", shuffled_w_nk) + + logger.info_once("[AiterBpreshuffle: Weight shuffle COMPLETE.") + + else: + raise ValueError( + f"Weight shape (N={N}, K={K}) not divisible by 16 " + "for AITER bpreshuffle." + ) + + def apply_weights( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + # 1. Obtain parameters + w_q, w_s, x_s, x_s_ub = self._get_layer_params(layer) + # 2. Dynamic quantization input + qinput, qinput_scale = self.quant_fp8(x, x_s, x_s_ub) + + logger.info_once( + "AiterBpreshuffle: apply_weights... ABOUT TO CALL C++ KERNEL..." + ) + + output = rocm_aiter_ops.gemm_a8w8_bpreshuffle( + qinput, + w_q, # Input [N, K] shuffle weights + out_dtype=self.config.out_dtype, + scale_a=qinput_scale, + scale_b=w_s, + ) + + logger.info_once("AiterBpreshuffle: C++ KERNEL CALL SUCCEEDED.") + + if bias is not None: + output.add_(bias) + return output + + def get_scaled_mm_func(self) -> Callable[..., torch.Tensor]: + return rocm_aiter_ops.gemm_a8w8_bpreshuffle + + +class AiterCKPerTokenFp8ScaledMMLinearKernel(FP8ScaledMMLinearKernel): + """ + AITER PTPC kernel (gemm_a8w8_CK) without pre-shuffling. + """ + + def get_ouput_padding(self) -> int | None: + return None + + @classmethod + def can_implement(cls, c: FP8ScaledMMLinearLayerConfig) -> tuple[bool, str | None]: + if not current_platform.is_rocm(): + return (False, "AITER CK is ROCm-only") + + if not rocm_aiter_ops.is_linear_enabled(): + return (False, "AITER CK is disabled by env var") + + try: + import aiter # noqa: F401 + except Exception: + return (False, "AITER not installed") + + is_per_channel_weight = c.weight_quant_key.scale.group_shape.is_per_token() + is_per_token_activation = ( + c.activation_quant_key.scale.group_shape.is_per_token() + ) + is_ptpc = is_per_channel_weight and is_per_token_activation + + logger.info_once(f"AiterCK: can_implement called. is_ptpc={is_ptpc}") + + if not is_ptpc: + return (False, "This kernel only handles Per-Token/Per-Channel (PTPC)") + + return True, None + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + logger.info_once( + "AITER CK: process_weights_after_loading... DOING NOTHING (pass)." + ) + pass + + def apply_weights( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + w_q, w_s, x_s, x_s_ub = self._get_layer_params(layer) + + qinput, qinput_scale = self.quant_fp8(x, x_s, x_s_ub) + + logger.info_once( + "AiterCK: apply_weights... " + "ABOUT TO CALL C++ KERNEL (this is where it hangs)..." 
+ ) + + output = rocm_aiter_ops.gemm_a8w8( + qinput, w_q.t(), qinput_scale, w_s, bias, self.config.out_dtype + ) + + logger.info_once("AiterCK: C++ KERNEL CALL SUCCEEDED.") + return output + + def get_scaled_mm_func(self) -> Callable[..., torch.Tensor]: + return rocm_aiter_ops.gemm_a8w8 From 577bb34fffc83598d3e4940f8492c122d9e3318d Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Mon, 17 Nov 2025 15:47:24 +0800 Subject: [PATCH 125/578] [CPU][Bugfix] Fix _to_list in CPU model runner (#28824) Signed-off-by: jiang1.li --- csrc/cpu/torch_bindings.cpp | 8 ++++++++ vllm/v1/worker/cpu_model_runner.py | 3 --- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index 5e2aa7069256..9fefd88cd9b0 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -100,6 +100,9 @@ void cpu_attention_with_kv_cache( const torch::Tensor& scheduler_metadata, const std::optional& s_aux); +// Note: just for avoiding importing errors +void placeholder_op() { TORCH_CHECK(false, "Unimplemented"); } + TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // vLLM custom ops @@ -275,6 +278,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "sliding_window_left, SymInt sliding_window_right, Tensor block_table, " "float softcap, Tensor sheduler_metadata, Tensor? s_aux) -> ()", &cpu_attention_with_kv_cache); + + // placeholders + ops.def("static_scaled_fp8_quant() -> ()", placeholder_op); + ops.def("dynamic_scaled_fp8_quant() -> ()", placeholder_op); + ops.def("dynamic_per_token_scaled_fp8_quant() -> ()", placeholder_op); } TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) { diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py index ceb1cf64b588..40f011fed1ad 100644 --- a/vllm/v1/worker/cpu_model_runner.py +++ b/vllm/v1/worker/cpu_model_runner.py @@ -80,9 +80,6 @@ def _init_device_properties(self) -> None: def _sync_device(self) -> None: pass - def _to_list(self, sampled_token_ids: torch.Tensor) -> list[list[int]]: - return sampled_token_ids.tolist() - def get_dp_padding(self, num_tokens: int) -> tuple[int, torch.Tensor | None]: # Note: For CPU backend, dp padding is not required for now. return 0, None From ab01cd14e5e2ef65549b459d0a2bf3a2540c9f3f Mon Sep 17 00:00:00 2001 From: wuyaoxuehun <798143193@qq.com> Date: Mon, 17 Nov 2025 16:13:11 +0700 Subject: [PATCH 126/578] [BugFix] Fix glm4_moe_mtp load weights bug (#28805) Signed-off-by: wuyaoxuehun <798143193@qq.com> --- vllm/model_executor/models/glm4_moe_mtp.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/glm4_moe_mtp.py b/vllm/model_executor/models/glm4_moe_mtp.py index 110ed0a64633..e34ae6c85a4f 100644 --- a/vllm/model_executor/models/glm4_moe_mtp.py +++ b/vllm/model_executor/models/glm4_moe_mtp.py @@ -256,13 +256,12 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) loaded_params: set[str] = set() - spec_layer = self.model.mtp_start_layer_idx for name, loaded_weight in weights: if name == "lm_head.weight": - name = f"model.layers.{spec_layer}.shard_head.head.weight" + spec_layer = self.model.mtp_start_layer_idx + name = f"model.layers.{spec_layer}.shared_head.head.weight" elif name == "model.embed_tokens.weight": - # This name is same with local model, rewriting is not needed. 
- pass + spec_layer = self.model.mtp_start_layer_idx else: spec_layer = get_spec_layer_idx_from_weight_name(self.config, name) if spec_layer is None: From d4acf518d09515560e1082a80b8a4d6550e20d9b Mon Sep 17 00:00:00 2001 From: Jae-Won Chung Date: Mon, 17 Nov 2025 04:54:15 -0500 Subject: [PATCH 127/578] [Metrics] Fix KV cache usage percent metric multiproc (#28792) The `vllm:kv_cache_usage_perc` Gauge metric is missing `multiprocess_mode="mostrecent"` and ends up returning ``` vllm:kv_cache_usage_perc{engine="0",model_name="Qwen/Qwen3-VL-8B-Instruct",pid="277"} 0.0 vllm:kv_cache_usage_perc{engine="0",model_name="Qwen/Qwen3-VL-8B-Instruct",pid="275"} 0.0 vllm:kv_cache_usage_perc{engine="0",model_name="Qwen/Qwen3-VL-8B-Instruct",pid="273"} 0.6530455880475035 ... ``` The deprecated `vllm:gpu_cache_usage_perc` Gauge metric has `multiprocess_mode="mostrecent"`. Signed-off-by: Jae-Won Chung --- vllm/v1/metrics/loggers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 21280b9c84cf..cb36e7973650 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -494,6 +494,7 @@ def __init__( gauge_kv_cache_usage = self._gauge_cls( name="vllm:kv_cache_usage_perc", documentation="KV-cache usage. 1 means 100 percent usage.", + multiprocess_mode="mostrecent", labelnames=labelnames, ) self.gauge_kv_cache_usage = make_per_engine( From 1b82fb0ad3cea2e1a31da4fa20dd736a8a181089 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Mon, 17 Nov 2025 21:16:44 +0800 Subject: [PATCH 128/578] [XPU] work around for sp, avoid custom op import error (#28822) Signed-off-by: Kunshang Ji --- vllm/compilation/pass_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py index 0c2210d72ce0..0e8bb2fc9735 100644 --- a/vllm/compilation/pass_manager.py +++ b/vllm/compilation/pass_manager.py @@ -18,6 +18,7 @@ from .fusion import RMSNormQuantFusionPass from .fusion_attn import AttnFusionPass from .qk_norm_rope_fusion import QKNormRoPEFusionPass + from .sequence_parallelism import SequenceParallelismPass if current_platform.is_cuda(): from .collective_fusion import AllReduceFusionPass, AsyncTPPass @@ -25,7 +26,6 @@ from .fix_functionalization import FixFunctionalizationPass from .inductor_pass import CustomGraphPass, InductorPass, get_pass_context from .noop_elimination import NoOpEliminationPass -from .sequence_parallelism import SequenceParallelismPass logger = init_logger(__name__) From 64e39d667cb5b550e6ce148acd3d4dcd1654eace Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Mon, 17 Nov 2025 09:41:22 -0500 Subject: [PATCH 129/578] [BugFix] Temporary fix for IMA with MTP = 2 and full-cg (#28315) Signed-off-by: Lucas Wilkinson --- vllm/config/compilation.py | 77 +++++++++++++++++++++++++----- vllm/v1/worker/gpu_model_runner.py | 16 +++++++ 2 files changed, 80 insertions(+), 13 deletions(-) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 10673041aa68..088d0b1af757 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -18,6 +18,7 @@ from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.utils.import_utils import resolve_obj_by_qualname +from vllm.utils.math_utils import round_up from vllm.utils.torch_utils import is_torch_equal_or_newer if TYPE_CHECKING: @@ -773,19 +774,8 @@ def post_init_cudagraph_sizes(self) -> None: if self.cudagraph_capture_sizes: assert 
self.cudagraph_capture_sizes[-1] == self.max_cudagraph_capture_size - # pre-compute the mapping from batch size to padded graph size - self.bs_to_padded_graph_size = [ - 0 for i in range(self.max_cudagraph_capture_size + 1) - ] - for end, start in zip( - self.cudagraph_capture_sizes + [self.max_cudagraph_capture_size + 1], - [0] + self.cudagraph_capture_sizes, - ): - for bs in range(start, end): - if bs == start: - self.bs_to_padded_graph_size[bs] = start - else: - self.bs_to_padded_graph_size[bs] = end + # May get recomputed in the model runner if adjustment is needed for spec-decode + self.compute_bs_to_padded_graph_size() def set_splitting_ops_for_v1(self): # NOTE: this function needs to be called only when mode is @@ -922,3 +912,64 @@ def custom_op_log_check(self): enable_str, op, ) + + def adjust_cudagraph_sizes_for_spec_decode( + self, uniform_decode_query_len: int, tensor_parallel_size: int + ): + multiple_of = uniform_decode_query_len + if tensor_parallel_size > 1: + multiple_of = max(uniform_decode_query_len, tensor_parallel_size) + if ( + multiple_of % uniform_decode_query_len != 0 + or multiple_of % tensor_parallel_size != 0 + ): + raise ValueError( + f"Can't determine cudagraph shapes that are both a " + f"multiple of {uniform_decode_query_len} " + f"(num_speculative_tokens + 1) required by spec-decode " + f"and {tensor_parallel_size} (tensor_parallel_size) " + f"required by sequence parallelism please adjust " + f"num_speculative_tokens or disable sequence parallelism" + ) + + if not self.cudagraph_capture_sizes or multiple_of <= 1: + return + + assert self.max_cudagraph_capture_size is not None + rounded_sizes = sorted( + set( + round_up(size, multiple_of) + for size in self.cudagraph_capture_sizes + if round_up(size, multiple_of) <= self.max_cudagraph_capture_size + ) + ) + + if len(rounded_sizes) == 0: + logger.warning( + "No valid cudagraph sizes after rounding to multiple of " + " num_speculative_tokens + 1 (%d); please adjust num_speculative_tokens" + " or max_cudagraph_capture_size (or cudagraph_capture_sizes)", + multiple_of, + ) + return + + self.max_cudagraph_capture_size = rounded_sizes[-1] + self.cudagraph_capture_sizes = rounded_sizes + + # Recompute after adjusting the cudagraph sizes + self.compute_bs_to_padded_graph_size() + + def compute_bs_to_padded_graph_size(self): + # pre-compute the mapping from batch size to padded graph size + self.bs_to_padded_graph_size = [ + 0 for i in range(self.max_cudagraph_capture_size + 1) + ] + for end, start in zip( + self.cudagraph_capture_sizes + [self.max_cudagraph_capture_size + 1], + [0] + self.cudagraph_capture_sizes, + ): + for bs in range(start, end): + if bs == start: + self.bs_to_padded_graph_size[bs] = start + else: + self.bs_to_padded_graph_size[bs] = end diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index ffbac5fe12f7..4fe1b6487d58 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -4332,6 +4332,22 @@ def _check_and_update_cudagraph_mode( "and make sure compilation mode is VLLM_COMPILE" ) + # if we have dedicated decode cudagraphs, and spec-decode is enabled, + # we need to adjust the cudagraph sizes to be a multiple of the uniform + # decode query length to avoid: https://github.com/vllm-project/vllm/issues/28207 + # temp-fix: https://github.com/vllm-project/vllm/issues/28207#issuecomment-3504004536 + # Will be removed in the near future when we have seperate cudagraph capture + # sizes for decode and mixed prefill-decode. 
+ if ( + cudagraph_mode.decode_mode() == CUDAGraphMode.FULL + and cudagraph_mode.separate_routine() + and self.uniform_decode_query_len > 1 + ): + self.compilation_config.adjust_cudagraph_sizes_for_spec_decode( + self.uniform_decode_query_len, self.parallel_config.tensor_parallel_size + ) + self.cudagraph_batch_sizes = self.compilation_config.cudagraph_capture_sizes + # Trigger cudagraph dispatching keys initialization after # resolved cudagraph mode. self.cudagraph_dispatcher.initialize_cudagraph_keys( From 7f064491f80ba20e782f33f4da566ec7da5118d7 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 17 Nov 2025 06:49:25 -0800 Subject: [PATCH 130/578] [Bugfix][Perf] Revert applying HF processor on text-only inputs for multimodal models (#28858) Signed-off-by: Roger Wang --- tests/test_inputs.py | 35 +++++++---------------------------- vllm/inputs/preprocess.py | 14 ++++---------- 2 files changed, 11 insertions(+), 38 deletions(-) diff --git a/tests/test_inputs.py b/tests/test_inputs.py index 50a273016ab8..b1fb4e06a690 100644 --- a/tests/test_inputs.py +++ b/tests/test_inputs.py @@ -86,34 +86,6 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs): assert zipped["mm_processor_kwargs"] == exp_kwargs -@pytest.mark.parametrize( - "model_id", - [ - "facebook/opt-125m", - ], -) -@pytest.mark.parametrize( - "prompt", - [ - { - "prompt": "", - "multi_modal_data": {"dummy": []}, - }, - { - "prompt_token_ids": [], - "multi_modal_data": {"dummy": []}, - }, - ], -) -def test_preprocessor_text_no_mm_inputs(model_id, prompt): - model_config = ModelConfig(model=model_id) - tokenizer = init_tokenizer_from_configs(model_config) - input_preprocessor = InputPreprocessor(model_config, tokenizer) - - with pytest.raises(ValueError, match="does not support multimodal inputs"): - input_preprocessor.preprocess(prompt) - - @pytest.mark.parametrize( "model_id", [ @@ -127,6 +99,13 @@ def test_preprocessor_text_no_mm_inputs(model_id, prompt): {"prompt_token_ids": []}, ], ) +@pytest.mark.skip( + reason=( + "Applying huggingface processor on text inputs results in " + "significant performance regression for multimodal models. 
" + "See https://github.com/vllm-project/vllm/issues/26320" + ) +) def test_preprocessor_always_mm_code_path(model_id, prompt): model_config = ModelConfig(model=model_id) tokenizer = init_tokenizer_from_configs(model_config) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 80d5322a34c3..839c13868a16 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -348,18 +348,15 @@ def _process_tokens( ) inputs: TokenInputs | MultiModalInputs - if self.model_config.is_multimodal_model: + if multi_modal_data := parsed_content.get("multi_modal_data"): inputs = self._process_multimodal( prompt_token_ids, - parsed_content.get("multi_modal_data") or {}, + multi_modal_data, parsed_content.get("mm_processor_kwargs") or {}, tokenization_kwargs=tokenization_kwargs, mm_uuids=mm_uuids, ) else: - if parsed_content.get("multi_modal_data"): - raise ValueError("This model does not support multimodal inputs") - inputs = token_inputs(prompt_token_ids) if cache_salt := parsed_content.get("cache_salt"): @@ -377,18 +374,15 @@ def _process_text( prompt_text = parsed_content["prompt"] inputs: TokenInputs | MultiModalInputs - if self.model_config.is_multimodal_model: + if multi_modal_data := parsed_content.get("multi_modal_data"): inputs = self._process_multimodal( prompt_text, - parsed_content.get("multi_modal_data") or {}, + multi_modal_data, parsed_content.get("mm_processor_kwargs") or {}, tokenization_kwargs=tokenization_kwargs, mm_uuids=mm_uuids, ) else: - if parsed_content.get("multi_modal_data"): - raise ValueError("This model does not support multimodal inputs") - prompt_token_ids = self._tokenize_prompt( prompt_text, tokenization_kwargs=tokenization_kwargs, From e42bd8c2e3bfecdaf9c5a7ad99d7c7d7cb75a7b5 Mon Sep 17 00:00:00 2001 From: tiehexue Date: Tue, 18 Nov 2025 00:02:32 +0800 Subject: [PATCH 131/578] Cast return value to int64_t for cache size (#28814) Signed-off-by: tiehexue --- csrc/cpu/cpu_attn_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/cpu/cpu_attn_impl.hpp b/csrc/cpu/cpu_attn_impl.hpp index 5de8a114b2b5..344296528b65 100644 --- a/csrc/cpu/cpu_attn_impl.hpp +++ b/csrc/cpu/cpu_attn_impl.hpp @@ -754,7 +754,7 @@ class AttentionScheduler { return l2_cache_size >> 1; // use 50% of L2 cache } // Fallback if sysctlbyname fails - return 128 * 1024 >> 1; // use 50% of 128KB + return 128LL * 1024 >> 1; // use 50% of 128KB #else long l2_cache_size = sysconf(_SC_LEVEL2_CACHE_SIZE); TORCH_CHECK_NE(l2_cache_size, -1); From f8b19c0ffd65f7f6f01a0da4a39b6890f5db40cb Mon Sep 17 00:00:00 2001 From: Zhewen Li Date: Mon, 17 Nov 2025 10:15:26 -0800 Subject: [PATCH 132/578] [Bugfix] Fix GPT-OSS on AMD after #28603 (#28816) Signed-off-by: zhewenli --- .buildkite/test-amd.yaml | 9 +++++---- vllm/model_executor/layers/quantization/mxfp4.py | 4 ++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index e232000511c3..2471b509a9ff 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1068,7 +1068,7 @@ steps: # this runner has 2 GPUs available even though num_gpus=2 is not set - pytest -v -s tests/compile/test_fusion_all_reduce.py # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time - # Wrap with quotes to escape yaml + # Wrap with quotes to escape yaml - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'" - label: Blackwell Fusion E2E Tests # 
30 min @@ -1095,10 +1095,11 @@ steps: # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile -- label: Blackwell GPT-OSS Eval +- label: ROCm GPT-OSS Eval timeout_in_minutes: 60 working_dir: "/vllm-workspace/" - gpu: b200 + agent_pool: mi325_1 + mirror_hardwares: [amdproduction] optional: true # run on nightlies source_file_dependencies: - tests/evals/gpt_oss @@ -1107,7 +1108,7 @@ steps: - vllm/v1/attention/backends/flashinfer.py commands: - uv pip install --system 'gpt-oss[eval]==0.0.5' - - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 + - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 - label: Blackwell Quantized MoE Test timeout_in_minutes: 60 diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 5552c1ae5edf..b95d1a6b3a1f 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -755,8 +755,8 @@ def _interleave_mxfp4_cutlass_sm90(w): self.w13_weight = w13_weight self.w2_weight = w2_weight - layer.w13_weight = Parameter(w13_weight.data, requires_grad=False) - layer.w2_weight = Parameter(w2_weight.data, requires_grad=False) + layer.w13_weight = Parameter(w13_weight.storage.data, requires_grad=False) + layer.w2_weight = Parameter(w2_weight.storage.data, requires_grad=False) else: raise ValueError(f"Unsupported backend: {self.mxfp4_backend}") From d8874c61a55e40db4ada047f1736c38c86439fff Mon Sep 17 00:00:00 2001 From: Ronald Date: Tue, 18 Nov 2025 04:16:20 +0800 Subject: [PATCH 133/578] [Core] Async Scheduling X Spec Decoding Compatibility (#24799) Signed-off-by: Ronald1995 Signed-off-by: Nick Hill Signed-off-by: Benjamin Chislett Co-authored-by: Nick Hill Co-authored-by: Benjamin Chislett --- tests/v1/e2e/test_async_scheduling.py | 38 +-- vllm/config/speculative.py | 38 ++- vllm/config/vllm.py | 21 +- vllm/v1/core/sched/async_scheduler.py | 15 +- vllm/v1/core/sched/scheduler.py | 12 +- vllm/v1/engine/core.py | 6 +- vllm/v1/engine/processor.py | 17 ++ vllm/v1/sample/logits_processor/__init__.py | 2 +- vllm/v1/spec_decode/eagle.py | 7 +- vllm/v1/worker/gpu_input_batch.py | 3 + vllm/v1/worker/gpu_model_runner.py | 253 +++++++++++++++++--- 11 files changed, 314 insertions(+), 98 deletions(-) diff --git a/tests/v1/e2e/test_async_scheduling.py b/tests/v1/e2e/test_async_scheduling.py index c4aca82416cd..f732b05f09f9 100644 --- a/tests/v1/e2e/test_async_scheduling.py +++ b/tests/v1/e2e/test_async_scheduling.py @@ -15,7 +15,7 @@ from ...models.utils import check_outputs_equal MODEL = "Qwen/Qwen3-0.6B" -MTP_MODEL = "XiaomiMiMo/MiMo-7B-Base" +MTP_MODEL = "meta-llama/Llama-3.2-1B-Instruct" first_prompt = ( @@ -29,7 +29,8 @@ default_params = dict( temperature=0.0, # greedy - max_tokens=20, + max_tokens=23, + min_tokens=18, ) @@ -69,15 +70,9 @@ def test_without_spec_decoding( (True, "uni", True, None, True), ] - run_tests( - monkeypatch, - MODEL, - test_configs, - test_sampling_params, - ) + run_tests(monkeypatch, MODEL, test_configs, test_sampling_params) -@pytest.mark.skip("MTP model too big to run in fp32 in CI") def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch): """Test consistency and acceptance rates with some different combos of preemption, executor, async scheduling, 
prefill chunking, @@ -85,8 +80,9 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch): """ spec_config = { - "method": "mtp", + "method": "eagle3", "num_speculative_tokens": 2, + "model": "nm-testing/Llama3_2_1B_speculator.eagle3", } spec_config_short = spec_config | {"max_model_len": 50} @@ -106,12 +102,7 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch): (True, "uni", True, spec_config_short, True), ] - run_tests( - monkeypatch, - MTP_MODEL, - test_configs, - [{}], - ) + run_tests(monkeypatch, MTP_MODEL, test_configs, [{}]) @dynamo_config.patch(cache_size_limit=16) @@ -182,15 +173,13 @@ def run_tests( and test_acceptance_rate is not None ): if "spec_mml=None" in test_config: - # because the acceptance rate can vary, we use a looser - # tolerance here. assert ( pytest.approx(test_acceptance_rate, rel=5e-2) == base_acceptance_rate ) else: # Currently the reported acceptance rate is expected to be - # lower when we skip drafting altogether. + # lower when we sometimes skip drafting altogether. assert test_acceptance_rate > 0.05 print( f"PASSED: config=[{test_config}], params={params}" @@ -220,6 +209,7 @@ def run_test( ): spec_decoding = spec_config is not None cache_arg: dict[str, Any] = ( + # Force preemptions dict(num_gpu_blocks_override=32) if test_preemption else dict(gpu_memory_utilization=0.9) @@ -238,6 +228,7 @@ def run_test( model, max_model_len=512, enable_chunked_prefill=test_prefill_chunking, + # Force prefill chunking max_num_batched_tokens=48 if test_prefill_chunking else None, # enforce_eager=True, async_scheduling=async_scheduling, @@ -255,10 +246,7 @@ def run_test( results.append( vllm_model.generate( example_prompts, - sampling_params=SamplingParams( - **default_params, - **override_params, - ), + sampling_params=SamplingParams(**default_params, **override_params), return_logprobs=True, ) ) @@ -270,9 +258,7 @@ def run_test( if test_preemption: preemptions = _get_count( - metrics_before, - metrics_after, - "vllm:num_preemptions", + metrics_before, metrics_after, "vllm:num_preemptions" ) assert preemptions > 0, "preemption test had no preemptions" diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py index 31cdeabe501d..13a8632413d9 100644 --- a/vllm/config/speculative.py +++ b/vllm/config/speculative.py @@ -3,7 +3,7 @@ import ast import hashlib -from typing import TYPE_CHECKING, Any, Literal +from typing import TYPE_CHECKING, Any, Literal, get_args from pydantic import Field, SkipValidation, model_validator from pydantic.dataclasses import dataclass @@ -29,31 +29,25 @@ logger = init_logger(__name__) -SpeculativeMethod = Literal[ - "ngram", - "eagle", - "eagle3", - "medusa", - "mlp_speculator", - "draft_model", - "deepseek_mtp", - "ernie_mtp", - "qwen3_next_mtp", - "mimo_mtp", - "longcat_flash_mtp", - "pangu_ultra_moe_mtp", - "mtp", - "suffix", -] -MTP_MODEL_TYPES = ( +MTPModelTypes = Literal[ "deepseek_mtp", "mimo_mtp", "glm4_moe_mtp", "ernie_mtp", "qwen3_next_mtp", "longcat_flash_mtp", + "mtp", "pangu_ultra_moe_mtp", -) +] +EagleModelTypes = Literal["eagle", "eagle3", MTPModelTypes] +SpeculativeMethod = Literal[ + "ngram", + "medusa", + "mlp_speculator", + "draft_model", + "suffix", + EagleModelTypes, +] @config @@ -244,7 +238,7 @@ def __post_init__(self): # can not be detected, it will be considered as the "draft_model" by # default. 
-        if self.method in MTP_MODEL_TYPES:
+        if self.method in get_args(MTPModelTypes) and self.method != "mtp":
             logger.warning(
                 "method `%s` is deprecated and replaced with mtp.", self.method
             )
@@ -361,7 +355,9 @@ def __post_init__(self):
                     self.method = "medusa"
                 elif self.draft_model_config.hf_config.model_type == "mlp_speculator":
                     self.method = "mlp_speculator"
-                elif self.draft_model_config.hf_config.model_type in MTP_MODEL_TYPES:
+                elif self.draft_model_config.hf_config.model_type in get_args(
+                    MTPModelTypes
+                ):
                     self.method = "mtp"
                     if self.num_speculative_tokens > 1:
                         logger.warning(
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index bd98be48588f..672b004c4aa5 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -14,13 +14,14 @@
 from datetime import datetime
 from functools import lru_cache
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, TypeVar
+from typing import TYPE_CHECKING, Any, TypeVar, get_args
 
 import torch
 from pydantic import ConfigDict, Field, model_validator
 from pydantic.dataclasses import dataclass
 
 import vllm.envs as envs
+from vllm.config.speculative import EagleModelTypes
 from vllm.logger import enable_trace_function_call, init_logger
 from vllm.transformers_utils.runai_utils import is_runai_obj_uri
 from vllm.utils import random_uuid
@@ -374,10 +375,22 @@ def __post_init__(self):
                     "Async scheduling is not yet compatible with "
                     "pipeline_parallel_size > 1."
                 )
+            # Currently, async scheduling only supports EAGLE/MTP-style
+            # speculative decoding.
             if self.speculative_config is not None:
-                raise ValueError(
-                    "Async scheduling is not yet compatible with speculative decoding."
-                )
+                if self.speculative_config.method not in get_args(EagleModelTypes):
+                    raise ValueError(
+                        "Currently, async scheduling is only supported "
+                        "with EAGLE/MTP-style speculative decoding."
+                    )
+                if self.speculative_config.disable_padded_drafter_batch:
+                    raise ValueError(
+                        "Async scheduling with EAGLE/MTP speculative "
+                        "decoding is enabled, but "
+                        "disable_padded_drafter_batch=True is not supported "
+                        "in this configuration. Please set "
+                        "disable_padded_drafter_batch=False."
+                    )
             if not executor_supports_async_sched:
                 raise ValueError(
                     "Currently, async scheduling only supports `mp`, `uni`, or "
diff --git a/vllm/v1/core/sched/async_scheduler.py b/vllm/v1/core/sched/async_scheduler.py
index 0ad994c360b0..3214f65a0972 100644
--- a/vllm/v1/core/sched/async_scheduler.py
+++ b/vllm/v1/core/sched/async_scheduler.py
@@ -16,18 +16,25 @@ def _update_after_schedule(
     ) -> None:
         super()._update_after_schedule(scheduler_output)
         pending_structured_output_tokens = False
+        spec_decode_tokens = scheduler_output.scheduled_spec_decode_tokens
         for req_id in scheduler_output.num_scheduled_tokens:
            request = self.requests[req_id]
             pending_structured_output_tokens |= (
                 request.use_structured_output and request.num_output_placeholders > 0
             )
+            cur_num_spec_tokens = len(spec_decode_tokens.get(req_id, ()))
             if (
                 request.num_computed_tokens
-                == request.num_tokens + request.num_output_placeholders
+                == request.num_tokens
+                + request.num_output_placeholders
+                + cur_num_spec_tokens
             ):
-                # The request will generate a new token in this scheduling step.
-                # TODO(woosuk): Support speculative decoding.
-                request.num_output_placeholders += 1
+                # The request will generate a new token plus num_spec_tokens
+                # in this scheduling step.
+                request.num_output_placeholders += 1 + cur_num_spec_tokens
+                # Add placeholders for the new tokens in spec_token_ids.
+ # Wwe will update the actual spec token ids in the worker process. + request.spec_token_ids = [-1] * self.num_spec_tokens scheduler_output.pending_structured_output_tokens = ( pending_structured_output_tokens diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 8e62542337a7..61640e856ac1 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -348,7 +348,10 @@ def schedule(self) -> SchedulerOutput: # Speculative decode related. if request.spec_token_ids: num_scheduled_spec_tokens = ( - num_new_tokens + request.num_computed_tokens - request.num_tokens + num_new_tokens + + request.num_computed_tokens + - request.num_tokens + - request.num_output_placeholders ) if num_scheduled_spec_tokens > 0: # Trim spec_token_ids list to num_scheduled_spec_tokens. @@ -1024,7 +1027,12 @@ def update_from_output( # tokens and rejections. If some tokens are rejected, # num_computed_tokens is decreased by the number of rejected # tokens. - request.num_computed_tokens -= num_rejected + if request.num_computed_tokens > 0: + request.num_computed_tokens -= num_rejected + # If async scheduling, num_output_placeholders also includes + # the scheduled spec tokens count and so is similarly adjusted. + if request.num_output_placeholders > 0: + request.num_output_placeholders -= num_rejected spec_decoding_stats = self.make_spec_decoding_stats( spec_decoding_stats, num_draft_tokens=num_draft_tokens, diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index a6965182fc2c..508669cf527d 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -198,6 +198,7 @@ def __init__( self.step_fn = ( self.step if self.batch_queue is None else self.step_with_batch_queue ) + self.async_scheduling = vllm_config.scheduler_config.async_scheduling # Mark the startup heap as static so that it's ignored by GC. # Reduces pause times of oldest generation collections. @@ -341,7 +342,10 @@ def step(self) -> tuple[dict[int, EngineCoreOutputs], bool]: return engine_core_outputs, scheduler_output.total_num_scheduled_tokens > 0 def post_step(self, model_executed: bool) -> None: - if self.use_spec_decode and model_executed: + # When using async scheduling we can't get draft token ids in advance, + # so we update draft token ids in the worker process and don't + # need to update draft token ids here. + if not self.async_scheduling and self.use_spec_decode and model_executed: # Take the draft token ids. draft_token_ids = self.model_executor.take_draft_token_ids() if draft_token_ids is not None: diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index fffd075a5165..4cb911d8e22b 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -150,6 +150,23 @@ def _validate_supported_sampling_params( raise ValueError( "vLLM V1 does not support per request user provided logits processors." ) + # Async scheduling + spec decode currently incompatible with some + # sampling parameters. + if ( + self.vllm_config.speculative_config is not None + and self.vllm_config.scheduler_config.async_scheduling + and ( + params.frequency_penalty != 0.0 + or params.presence_penalty != 0.0 + or params.repetition_penalty != 1.0 + or params.bad_words_token_ids + or params.structured_outputs + ) + ): + raise ValueError( + "async scheduling with spec decoding doesn't yet support " + "penalties, bad words or structured outputs in sampling parameters." 
+ ) def _validate_params( self, diff --git a/vllm/v1/sample/logits_processor/__init__.py b/vllm/v1/sample/logits_processor/__init__.py index 5992c4066c9c..8b174af4c779 100644 --- a/vllm/v1/sample/logits_processor/__init__.py +++ b/vllm/v1/sample/logits_processor/__init__.py @@ -41,7 +41,7 @@ # Error message when the user tries to initialize vLLM with a speculative # decoding enabled and custom logitsproces STR_SPEC_DEC_REJECTS_LOGITSPROCS = ( - "Custom logits processors are not supportedwhen speculative decoding is enabled." + "Custom logits processors are not supported when speculative decoding is enabled." ) LOGITSPROCS_GROUP = "vllm.logits_processors" diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index ed602f39d0f9..5bf2503c3027 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -397,10 +397,13 @@ def propose( positions += 1 exceeds_max_model_len = positions >= self.max_model_len clamped_positions = torch.where(exceeds_max_model_len, 0, positions) - + # For data integrity when async scheduling, we shouldn't use in place + # operations in case they are modified in next step's `prepare_input` + # of main model. # Increment the sequence lengths. common_attn_metadata.seq_lens += 1 - common_attn_metadata.seq_lens_cpu += 1 + # This is an out-of-place operation to avoid modifying the original tensor. + common_attn_metadata.seq_lens_cpu = common_attn_metadata.seq_lens_cpu + 1 # For the requests that exceed the max model length, we set the # sequence length to 1 to minimize their overheads in attention. diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 393181f543d2..7cf6afa3fc37 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -46,6 +46,9 @@ class CachedRequestState: lora_request: LoRARequest | None = None prompt_embeds: torch.Tensor | None = None + # Used when both async_scheduling and spec_decode are enabled. + prev_num_draft_len: int = 0 + def __post_init__(self): self.num_prompt_tokens = length_from_prompt_token_ids_or_embeds( self.prompt_token_ids, self.prompt_embeds diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4fe1b6487d58..758e3e1b3a82 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -179,6 +179,7 @@ def __init__( logprobs_tensors: torch.Tensor | None, invalid_req_indices: list[int], async_output_copy_stream: torch.cuda.Stream, + vocab_size: int, ): self._model_runner_output = model_runner_output self._invalid_req_indices = invalid_req_indices @@ -189,6 +190,7 @@ def __init__( # Keep a reference to the device tensor to avoid it being # deallocated until we finish copying it to the host. self._sampled_token_ids = sampled_token_ids + self.vocab_size = vocab_size self._logprobs_tensors = logprobs_tensors # Initiate the copy on a separate stream, but do not synchronize it. @@ -215,10 +217,16 @@ def get_output(self) -> ModelRunnerOutput: # Release the device tensors once the copy has completed. 
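# Minimal sketch (not part of the patch) of the aliasing hazard that the
# eagle.py hunk above avoids by replacing `seq_lens_cpu += 1` with an
# out-of-place add: an in-place op mutates every holder of the tensor, while
# an out-of-place add rebinds the local name and leaves the tensor that the
# main model's next prepare_inputs still references untouched.
import torch

shared = torch.tensor([7, 7])  # stand-in for a CPU tensor shared across steps
alias = shared

alias += 1                     # in-place: the shared tensor changes as well
assert shared.tolist() == [8, 8]

alias = alias + 1              # out-of-place: new tensor, `shared` unchanged
assert shared.tolist() == [8, 8]
assert alias.tolist() == [9, 9]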
del self._logprobs_tensors del self._sampled_token_ids - - valid_sampled_token_ids: list[np.ndarray] = [ - row for row in self.sampled_token_ids_cpu.numpy() - ] + max_gen_len = self.sampled_token_ids_cpu.shape[-1] + if max_gen_len == 1: + valid_sampled_token_ids: list[np.ndarray] = [ + row for row in self.sampled_token_ids_cpu.numpy() + ] + else: + valid_sampled_token_ids = RejectionSampler.parse_output( + self.sampled_token_ids_cpu, + self.vocab_size, + ) for i in self._invalid_req_indices: valid_sampled_token_ids[i] = np.array([]) @@ -377,6 +385,10 @@ def __init__( ) self.rejection_sampler = RejectionSampler(self.sampler) + self.num_spec_tokens = 0 + if self.speculative_config: + self.num_spec_tokens = self.speculative_config.num_speculative_tokens + # Request states. self.requests: dict[str, CachedRequestState] = {} self.comm_stream = torch.cuda.Stream() @@ -513,11 +525,7 @@ def __init__( self.max_num_tokens, dtype=torch.int32, device=self.device ) - self.uniform_decode_query_len = ( - 1 - if not self.speculative_config - else 1 + self.speculative_config.num_speculative_tokens - ) + self.uniform_decode_query_len = 1 + self.num_spec_tokens # Cudagraph dispatcher for runtime cudagraph dispatching. self.cudagraph_dispatcher = CudagraphDispatcher(self.vllm_config) @@ -549,6 +557,20 @@ def __init__( pin_memory=self.pin_memory, ) + # Pre-allocated tensor for copying valid sampled token counts to CPU, + # with dedicated stream for overlapping and event for coordination. + self.valid_sampled_token_count_event: torch.cuda.Event | None = None + self.valid_sampled_token_count_copy_stream: torch.cuda.Stream | None = None + if self.use_async_scheduling and self.num_spec_tokens: + self.valid_sampled_token_count_event = torch.cuda.Event() + self.valid_sampled_token_count_copy_stream = torch.cuda.Stream() + self.valid_sampled_token_count_cpu = torch.empty( + self.max_num_reqs, + dtype=torch.int64, + device="cpu", + pin_memory=self.pin_memory, + ) + # Ephemeral state transferred between execute_model() and sample_tokens(). self.execute_model_state: ExecuteModelState | None = None @@ -736,17 +758,45 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Update the states of the running/resumed requests. is_last_rank = get_pp_group().is_last_rank req_data = scheduler_output.scheduled_cached_reqs + + # Wait until valid_sampled_tokens_count is copied to cpu, + # then use it to update actual num_computed_tokens of each request. + valid_sampled_token_count = self._get_valid_sampled_token_count() + for i, req_id in enumerate(req_data.req_ids): req_state = self.requests[req_id] num_computed_tokens = req_data.num_computed_tokens[i] new_block_ids = req_data.new_block_ids[i] resumed_from_preemption = req_id in req_data.resumed_req_ids num_output_tokens = req_data.num_output_tokens[i] + req_index = self.input_batch.req_id_to_index.get(req_id) - # Update the cached states. + # prev_num_draft_len is used in async scheduling mode with + # spec decode. it indicates if need to update num_computed_tokens + # of the request. for example: + # fist step: num_computed_tokens = 0, spec_tokens = [], + # prev_num_draft_len = 0. + # second step: num_computed_tokens = 100(prompt lenth), + # spec_tokens = [a,b], prev_num_draft_len = 0. + # third step: num_computed_tokens = 100 + 2, spec_tokens = [c,d], + # prev_num_draft_len = 2. + # num_computed_tokens in first step and second step does't contain + # the spec tokens length, but in third step it contains the + # spec tokens length. 
we only need to update num_computed_tokens + # when prev_num_draft_len > 0. + if req_state.prev_num_draft_len: + if req_index is None: + req_state.prev_num_draft_len = 0 + else: + assert self.input_batch.prev_req_id_to_index is not None + prev_req_index = self.input_batch.prev_req_id_to_index[req_id] + num_accepted = valid_sampled_token_count[prev_req_index] - 1 + num_rejected = req_state.prev_num_draft_len - num_accepted + num_computed_tokens -= num_rejected + req_state.output_token_ids.extend([-1] * num_accepted) + # Update the cached states. req_state.num_computed_tokens = num_computed_tokens - req_index = self.input_batch.req_id_to_index.get(req_id) if not is_last_rank: # When using PP, the scheduler sends the sampled tokens back, @@ -823,8 +873,11 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: spec_token_ids = scheduler_output.scheduled_spec_decode_tokens.get( req_id, [] ) - if spec_token_ids: - num_spec_tokens = len(spec_token_ids) + num_spec_tokens = len(spec_token_ids) + # For async scheduling, token_ids_cpu assigned from + # spec_token_ids are placeholders and will be overwritten in + # _prepare_input_ids. + if num_spec_tokens: start_index = self.input_batch.num_tokens_no_spec[req_index] end_token_index = start_index + num_spec_tokens self.input_batch.token_ids_cpu[ @@ -840,6 +893,15 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # even when speculative decoding is enabled. self.input_batch.spec_token_ids[req_index] = spec_token_ids + # there are no draft tokens with async scheduling, + # we clear the spec_decoding info in scheduler_output and + # use normal sampling but rejection_sampling. + if self.use_async_scheduling: + req_state.prev_num_draft_len = num_spec_tokens + if num_spec_tokens and self._draft_token_ids is None: + scheduler_output.total_num_scheduled_tokens -= num_spec_tokens + scheduler_output.num_scheduled_tokens[req_id] -= num_spec_tokens + scheduler_output.scheduled_spec_decode_tokens.pop(req_id, None) # Add the new or resumed requests to the persistent batch. # The smaller empty indices are filled first. for request in reqs_to_add: @@ -959,7 +1021,10 @@ def _get_cumsum_and_arange( return cu_num_tokens, arange def _prepare_input_ids( - self, total_num_scheduled_tokens: int, cu_num_tokens: np.ndarray + self, + scheduler_output: "SchedulerOutput", + total_num_scheduled_tokens: int, + cu_num_tokens: np.ndarray, ) -> None: """Prepare the input IDs for the current batch. @@ -980,21 +1045,43 @@ def _prepare_input_ids( # on the GPU from prev_sampled_token_ids. prev_req_id_to_index = self.input_batch.prev_req_id_to_index assert prev_req_id_to_index is not None - flattened_indices = [] - prev_common_req_indices = [] + sample_flattened_indices: list[int] = [] + spec_flattened_indices: list[int] = [] + prev_common_req_indices: list[int] = [] + prev_draft_token_indices: list[int] = [] indices_match = True max_flattened_index = -1 + total_num_spec_tokens = 0 + scheduled_spec_tokens = scheduler_output.scheduled_spec_decode_tokens + for req_id, cur_index in self.input_batch.req_id_to_index.items(): if (prev_index := prev_req_id_to_index.get(req_id)) is not None: prev_common_req_indices.append(prev_index) # We need to compute the flattened input_ids index of the # last token in each common request. 
+ draft_len = len(scheduled_spec_tokens.get(req_id, ())) + total_num_spec_tokens += draft_len flattened_index = cu_num_tokens[cur_index].item() - 1 - flattened_indices.append(flattened_index) + # example: cu_num_tokens = [2, 5, 8], draft_tokens = [1, 2, 2] + # sample_flattened_indices = [0, 2, 5] + # spec_flattened_indices = [1, 3, 4, 6, 7] + sample_flattened_indices.append(flattened_index - draft_len) + spec_flattened_indices.extend( + range(flattened_index - draft_len + 1, flattened_index + 1) + ) + start = prev_index * self.num_spec_tokens + # prev_draft_token_indices is used to find which draft_tokens_id + # should be copied to input_ids + # example: prev draft_tokens_id [[1,2], [3,4], [5, 6]] + # flatten draft_tokens_id [1,2,3,4,5,6] + # draft_len of each request [1, 2, 1] + # then prev_draft_token_indices is [0, 2, 3, 4] + prev_draft_token_indices.extend(range(start, start + draft_len)) indices_match &= prev_index == flattened_index max_flattened_index = max(max_flattened_index, flattened_index) - num_commmon_tokens = len(flattened_indices) - if num_commmon_tokens < total_num_scheduled_tokens: + num_commmon_tokens = len(sample_flattened_indices) + total_without_spec = total_num_scheduled_tokens - total_num_spec_tokens + if num_commmon_tokens < total_without_spec: # If not all requests are decodes from the last iteration, # We need to copy the input_ids_cpu to the GPU first. self.input_ids.copy_to_gpu(total_num_scheduled_tokens) @@ -1018,20 +1105,43 @@ def _prepare_input_ids( self.is_token_ids.gpu[:num_commmon_tokens] = True return # Upload the index tensors asynchronously so the scatter can be non-blocking. - input_ids_index_tensor = torch.tensor( - flattened_indices, dtype=torch.int64, pin_memory=self.pin_memory + sampled_tokens_index_tensor = torch.tensor( + sample_flattened_indices, dtype=torch.int64, pin_memory=self.pin_memory ).to(self.device, non_blocking=True) prev_common_req_indices_tensor = torch.tensor( prev_common_req_indices, dtype=torch.int64, pin_memory=self.pin_memory ).to(self.device, non_blocking=True) self.input_ids.gpu.scatter_( dim=0, - index=input_ids_index_tensor, + index=sampled_tokens_index_tensor, src=self.input_batch.prev_sampled_token_ids[ prev_common_req_indices_tensor, 0 ], ) + # Scatter the draft tokens after the sampled tokens are scattered. + if self._draft_token_ids is None or not spec_flattened_indices: + return + + assert isinstance(self._draft_token_ids, torch.Tensor) + draft_tokens_index_tensor = torch.tensor( + spec_flattened_indices, dtype=torch.int64, pin_memory=self.pin_memory + ).to(self.device, non_blocking=True) + prev_draft_token_indices_tensor = torch.tensor( + prev_draft_token_indices, dtype=torch.int64, pin_memory=self.pin_memory + ).to(self.device, non_blocking=True) + + # because input_ids dtype is torch.int32, + # so convert draft_token_ids to torch.int32 here. + draft_token_ids = self._draft_token_ids.to(dtype=torch.int32) + self._draft_token_ids = None + + self.input_ids.gpu.scatter_( + dim=0, + index=draft_tokens_index_tensor, + src=draft_token_ids.flatten()[prev_draft_token_indices_tensor], + ) + def _get_encoder_seq_lens( self, scheduled_encoder_inputs: dict[str, list[int]], @@ -1218,7 +1328,11 @@ def _prepare_inputs( self.discard_request_indices.copy_to_gpu(self.num_discarded_requests) # Copy the tensors to the GPU. 
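# Standalone sketch (not part of the patch) reproducing the worked example
# documented in the _prepare_input_ids comments above: where the sampled
# (bonus) token and the draft tokens of each request land in the flattened
# input_ids buffer, and which entries of the flattened previous-step draft
# buffer should be copied from.
cu_num_tokens = [2, 5, 8]  # cumulative scheduled tokens per request
draft_lens = [1, 2, 2]     # scheduled draft tokens per request

sample_flattened_indices = []
spec_flattened_indices = []
for cu, draft_len in zip(cu_num_tokens, draft_lens):
    last = cu - 1  # flattened index of the request's last token
    sample_flattened_indices.append(last - draft_len)
    spec_flattened_indices.extend(range(last - draft_len + 1, last + 1))

assert sample_flattened_indices == [0, 2, 5]
assert spec_flattened_indices == [1, 3, 4, 6, 7]

# Previous-step draft ids [[1, 2], [3, 4], [5, 6]] flatten to
# [1, 2, 3, 4, 5, 6]; with num_spec_tokens == 2 and per-request draft lens
# [1, 2, 1], the entries to copy sit at positions [0, 2, 3, 4].
num_spec_tokens = 2
prev_draft_token_indices = []
for prev_index, draft_len in enumerate([1, 2, 1]):
    start = prev_index * num_spec_tokens
    prev_draft_token_indices.extend(range(start, start + draft_len))
assert prev_draft_token_indices == [0, 2, 3, 4]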
- self._prepare_input_ids(total_num_scheduled_tokens, cu_num_tokens) + self._prepare_input_ids( + scheduler_output, + total_num_scheduled_tokens, + cu_num_tokens, + ) if self.uses_mrope: # Only relevant for models using M-RoPE (e.g, Qwen2-VL) @@ -2377,12 +2491,14 @@ def _bookkeeping_sync( valid_sampled_token_ids = [] invalid_req_indices = discard_sampled_tokens_req_indices.tolist() invalid_req_indices_set = set(invalid_req_indices) - assert sampled_token_ids.shape[-1] == 1 # Cache the sampled tokens on the GPU and avoid CPU sync. # These will be copied into input_ids in the next step # when preparing inputs. - self.input_batch.prev_sampled_token_ids = sampled_token_ids + # With spec decoding, this is done in propose_draft_token_ids(). + if self.input_batch.prev_sampled_token_ids is None: + assert sampled_token_ids.shape[-1] == 1 + self.input_batch.prev_sampled_token_ids = sampled_token_ids self.input_batch.prev_req_id_to_index = { req_id: i for i, req_id in enumerate(self.input_batch.req_ids) @@ -2517,6 +2633,21 @@ def execute_model( "State error: sample_tokens() must be called " "after execute_model() returns None." ) + + # self._draft_token_ids is None when `input_fits_in_drafter=False` + # and there is no draft tokens scheduled. so it need to update the + # spec_decoding info in scheduler_output with async_scheduling. + # use deepcopy to avoid the modification has influence on the + # scheduler_output in engine core process. + # TODO(Ronald1995): deepcopy is expensive when there is a large + # number of requests, optimize it later. + if ( + self.use_async_scheduling + and self.num_spec_tokens + and self._draft_token_ids is None + ): + scheduler_output = deepcopy(scheduler_output) + num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens with record_function_or_nullcontext("gpu_model_runner: preprocess"): with self.synchronize_input_prep(): @@ -2759,6 +2890,8 @@ def sample_tokens( with record_function_or_nullcontext("gpu_model_runner: sample"): sampler_output = self._sample(logits, spec_decode_metadata) + self.input_batch.prev_sampled_token_ids = None + def propose_draft_token_ids( sampled_token_ids: torch.Tensor | list[np.ndarray], ) -> None: @@ -2792,14 +2925,29 @@ def propose_draft_token_ids( self.speculative_config.draft_model_config.max_model_len ) input_fits_in_drafter = spec_decode_common_attn_metadata and ( - spec_decode_common_attn_metadata.max_seq_len - + self.speculative_config.num_speculative_tokens + spec_decode_common_attn_metadata.max_seq_len + self.num_spec_tokens <= effective_drafter_max_model_len ) - if use_padded_batch_for_eagle and input_fits_in_drafter: - # EAGLE speculative decoding can use the GPU sampled tokens - # as inputs, and does not need to wait for bookkeeping to finish. - propose_draft_token_ids(sampler_output.sampled_token_ids) + if use_padded_batch_for_eagle: + sampled_token_ids = sampler_output.sampled_token_ids + if input_fits_in_drafter: + # EAGLE speculative decoding can use the GPU sampled tokens + # as inputs, and does not need to wait for bookkeeping to finish. 
+ propose_draft_token_ids(sampled_token_ids) + elif self.valid_sampled_token_count_event is not None: + next_token_ids, valid_sampled_tokens_count = ( + self.drafter.prepare_next_token_ids_padded( + spec_decode_common_attn_metadata, + sampled_token_ids, + self.requests, + self.input_batch, + self.discard_request_indices.gpu, + self.num_discarded_requests, + ) + ) + self._copy_valid_sampled_token_count( + next_token_ids, valid_sampled_tokens_count + ) with record_function_or_nullcontext("gpu_model_runner: bookkeep"): ( @@ -2856,6 +3004,7 @@ def propose_draft_token_ids( logprobs_tensors=sampler_output.logprobs_tensors, invalid_req_indices=invalid_req_indices, async_output_copy_stream=self.async_output_copy_stream, + vocab_size=self.input_batch.vocab_size, ) with record_function_or_nullcontext( "gpu_model_runner: set_async_sampled_token_ids" @@ -2880,6 +3029,37 @@ def take_draft_token_ids(self) -> DraftTokenIds | None: self._draft_token_ids = None return DraftTokenIds(req_ids, draft_token_ids) + def _copy_valid_sampled_token_count( + self, next_token_ids: torch.Tensor, valid_sampled_tokens_count: torch.Tensor + ) -> None: + if self.valid_sampled_token_count_event is None: + return + + default_stream = torch.cuda.current_stream() + # Initialize a new stream to overlap the copy operation with + # prepare_input of draft model. + with torch.cuda.stream(self.valid_sampled_token_count_copy_stream): + self.valid_sampled_token_count_copy_stream.wait_stream(default_stream) # type: ignore + counts = valid_sampled_tokens_count + counts_cpu = self.valid_sampled_token_count_cpu + counts_cpu[: counts.shape[0]].copy_(counts, non_blocking=True) + self.valid_sampled_token_count_event.record() + + self.input_batch.prev_sampled_token_ids = next_token_ids.unsqueeze(1) + + def _get_valid_sampled_token_count(self) -> list[int]: + # Wait until valid_sampled_tokens_count is copied to cpu, + prev_sampled_token_ids = self.input_batch.prev_sampled_token_ids + if ( + self.valid_sampled_token_count_event is None + or prev_sampled_token_ids is None + ): + return [] + + counts_cpu = self.valid_sampled_token_count_cpu + self.valid_sampled_token_count_event.synchronize() + return counts_cpu[: prev_sampled_token_ids.shape[0]].tolist() + def propose_draft_token_ids( self, scheduler_output: "SchedulerOutput", @@ -2967,6 +3147,9 @@ def propose_draft_token_ids( self.num_discarded_requests, ) ) + self._copy_valid_sampled_token_count( + next_token_ids, valid_sampled_tokens_count + ) if spec_decode_metadata is None: token_indices_to_sample = None @@ -3532,7 +3715,7 @@ def _dummy_run( # TODO(luka) better system for describing dummy batches seq_lens = [1] * num_decode_tokens + [num_prefill_tokens + 1] else: - seq_lens = max_query_len + seq_lens = max_query_len # type: ignore[assignment] self.seq_lens.np[:num_reqs] = seq_lens self.seq_lens.np[num_reqs:] = 0 self.seq_lens.copy_to_gpu() @@ -4485,11 +4668,7 @@ def may_reinitialize_input_batch( logitsprocs=self.input_batch.logitsprocs, logitsprocs_need_output_token_ids=self.input_batch.logitsprocs_need_output_token_ids, is_pooling_model=self.is_pooling_model, - num_speculative_tokens=( - self.vllm_config.speculative_config.num_speculative_tokens - if self.vllm_config.speculative_config - else 0 - ), + num_speculative_tokens=self.num_spec_tokens, ) def _allocate_kv_cache_tensors( From 7765e5ba75c0b5caa8f372bfa20ab3de2c6b3aac Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Mon, 17 Nov 2025 14:08:50 -0800 Subject: [PATCH 134/578] [BugFix] Fix PP performance and PP kv connector output 
regression (#28768) Signed-off-by: Nick Hill --- vllm/v1/engine/core.py | 150 +++++++++++++---------------- vllm/v1/executor/ray_executor.py | 21 +++- vllm/v1/worker/gpu_model_runner.py | 23 ++++- vllm/v1/worker/gpu_worker.py | 15 +-- 4 files changed, 105 insertions(+), 104 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 508669cf527d..97286c6e2e5e 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -63,7 +63,6 @@ from vllm.v1.request import Request, RequestStatus from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder from vllm.v1.structured_output import StructuredOutputManager -from vllm.v1.utils import record_function_or_nullcontext from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) @@ -181,11 +180,13 @@ def __init__( logger.info("Batch queue is enabled with size %d", self.batch_queue_size) self.batch_queue = deque(maxlen=self.batch_queue_size) + self.ec_producer = ( + vllm_config.ec_transfer_config is not None + and vllm_config.ec_transfer_config.is_ec_producer + ) + self.request_block_hasher: Callable[[Request], list[BlockHash]] | None = None - if ( - self.vllm_config.cache_config.enable_prefix_caching - or kv_connector is not None - ): + if vllm_config.cache_config.enable_prefix_caching or kv_connector is not None: caching_hash_fn = get_hash_fn_by_name( vllm_config.cache_config.prefix_caching_hash_algo ) @@ -246,7 +247,7 @@ def _initialize_kv_caches( elapsed = time.time() - start logger.info_once( - ("init engine (profile, create kv cache, warmup model) took %.2f seconds"), + "init engine (profile, create kv cache, warmup model) took %.2f seconds", elapsed, scope="local", ) @@ -312,6 +313,16 @@ def log_error_detail(self, scheduler_output: SchedulerOutput): ) raise err + def _log_err_callback(self, scheduler_output: SchedulerOutput): + """Log error details of a future that's not expected to return a result.""" + + def callback(f, sched_output=scheduler_output): + with self.log_error_detail(sched_output): + result = f.result() + assert result is None + + return callback + def step(self) -> tuple[dict[int, EngineCoreOutputs], bool]: """Schedule, execute, and make output. @@ -323,21 +334,17 @@ def step(self) -> tuple[dict[int, EngineCoreOutputs], bool]: # or finished and not yet removed from the batch. 
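# Generic sketch (not part of the patch) of the error-surfacing pattern behind
# _log_err_callback above: when a future's result is never consumed directly,
# a done-callback still calls .result() so any exception raised by the worker
# is surfaced instead of being silently dropped. The names below are
# illustrative only, not vLLM APIs.
from concurrent.futures import ThreadPoolExecutor


def make_err_callback(tag):
    def callback(fut):
        try:
            result = fut.result()  # re-raises if the submitted work failed
            assert result is None  # this future is not expected to carry data
        except Exception as exc:   # stand-in for logging scheduler context
            print(f"step {tag!r} failed: {exc!r}")

    return callback


with ThreadPoolExecutor(max_workers=1) as pool:
    fut = pool.submit(lambda: None)  # stand-in for execute_model(..., non_block=True)
    fut.add_done_callback(make_err_callback("execute_model"))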
if not self.scheduler.has_requests(): return {}, False - with record_function_or_nullcontext("core step: schedule"): - scheduler_output = self.scheduler.schedule() - - with record_function_or_nullcontext("core step: execute_model"): - future = self.model_executor.execute_model(scheduler_output, non_block=True) - grammar_output = self.scheduler.get_grammar_bitmask(scheduler_output) - with self.log_error_detail(scheduler_output): - model_output = future.result() - if model_output is None: - model_output = self.model_executor.sample_tokens(grammar_output) - - with record_function_or_nullcontext("core step: update_from_output"): - engine_core_outputs = self.scheduler.update_from_output( - scheduler_output, model_output - ) + scheduler_output = self.scheduler.schedule() + future = self.model_executor.execute_model(scheduler_output, non_block=True) + grammar_output = self.scheduler.get_grammar_bitmask(scheduler_output) + with self.log_error_detail(scheduler_output): + model_output = future.result() + if model_output is None: + model_output = self.model_executor.sample_tokens(grammar_output) + + engine_core_outputs = self.scheduler.update_from_output( + scheduler_output, model_output + ) return engine_core_outputs, scheduler_output.total_num_scheduled_tokens > 0 @@ -378,52 +385,34 @@ def step_with_batch_queue( model_executed = False deferred_scheduler_output = None if self.scheduler.has_requests(): - with record_function_or_nullcontext("core step_with_batch_queue: schedule"): - scheduler_output = self.scheduler.schedule() - with record_function_or_nullcontext( - "core step_with_batch_queue: execute_model" - ): - exec_future = self.model_executor.execute_model( - scheduler_output, non_block=True - ) - model_executed = scheduler_output.total_num_scheduled_tokens > 0 + scheduler_output = self.scheduler.schedule() + exec_future = self.model_executor.execute_model( + scheduler_output, non_block=True + ) + if not self.ec_producer: + model_executed = scheduler_output.total_num_scheduled_tokens > 0 - if scheduler_output.pending_structured_output_tokens: - with record_function_or_nullcontext( - "core step_with_batch_queue: pending_structured_output_tokens" - ): - # We need to defer sampling until we have processed the model output - # from the prior step. - deferred_scheduler_output = scheduler_output - # Block-wait for execute to return - # (continues running async on the GPU). - with self.log_error_detail(scheduler_output): - exec_result = exec_future.result() - assert exec_result is None + if not model_executed: + # No sampling required (no requests scheduled). + future = cast(Future[ModelRunnerOutput], exec_future) else: - with record_function_or_nullcontext( - "core step_with_batch_queue: get_grammar_bitmask" - ): - # We aren't waiting for any tokens, get any grammar - # output immediately. + exec_future.add_done_callback(self._log_err_callback(scheduler_output)) + + if not scheduler_output.pending_structured_output_tokens: + # We aren't waiting for any tokens, get any grammar output + # and sample immediately. grammar_output = self.scheduler.get_grammar_bitmask( scheduler_output ) - # Block-wait for execute to return (continues running async on the GPU). - with self.log_error_detail(scheduler_output): - exec_result = exec_future.result() - - if exec_result is None: - with record_function_or_nullcontext( - "core step_with_batch_queue: sample_tokens" - ): - # Call sample tokens. 
- future = self.model_executor.sample_tokens( - grammar_output, non_block=True - ) + future = self.model_executor.sample_tokens( + grammar_output, non_block=True + ) else: - # No sampling required (e.g. all requests finished). - future = cast(Future[ModelRunnerOutput], exec_future) + # We need to defer sampling until we have processed the model output + # from the prior step. + deferred_scheduler_output = scheduler_output + + if not deferred_scheduler_output: # Add this step's future to the queue. batch_queue.appendleft((future, scheduler_output)) if ( @@ -440,34 +429,27 @@ def step_with_batch_queue( # only be called when the scheduler contains requests or the queue # is non-empty. return None, False - with record_function_or_nullcontext("core step_with_batch_queue: model_output"): - # Block until the next result is available. - future, scheduler_output = batch_queue.pop() - with self.log_error_detail(scheduler_output): - model_output = future.result() - with record_function_or_nullcontext( - "core step_with_batch_queue: update_from_output" - ): - engine_core_outputs = self.scheduler.update_from_output( - scheduler_output, model_output - ) + + # Block until the next result is available. + future, scheduler_output = batch_queue.pop() + with self.log_error_detail(scheduler_output): + model_output = future.result() + + engine_core_outputs = self.scheduler.update_from_output( + scheduler_output, model_output + ) # NOTE(nick): We can either handle the deferred tasks here or save # in a field and do it immediately once step_with_batch_queue is # re-called. The latter slightly favors TTFT over TPOT/throughput. if deferred_scheduler_output: - with record_function_or_nullcontext( - "core step_with_batch_queue: deferred_scheduler_output" - ): - # We now have the tokens needed to compute the bitmask for the - # deferred request. Get the bitmask and call sample tokens. - grammar_output = self.scheduler.get_grammar_bitmask( - deferred_scheduler_output - ) - future = self.model_executor.sample_tokens( - grammar_output, non_block=True - ) - batch_queue.appendleft((future, deferred_scheduler_output)) + # We now have the tokens needed to compute the bitmask for the + # deferred request. Get the bitmask and call sample tokens. + grammar_output = self.scheduler.get_grammar_bitmask( + deferred_scheduler_output + ) + future = self.model_executor.sample_tokens(grammar_output, non_block=True) + batch_queue.appendleft((future, deferred_scheduler_output)) return engine_core_outputs, model_executed diff --git a/vllm/v1/executor/ray_executor.py b/vllm/v1/executor/ray_executor.py index 119e4c081831..55db7445c9c7 100644 --- a/vllm/v1/executor/ray_executor.py +++ b/vllm/v1/executor/ray_executor.py @@ -99,6 +99,11 @@ def _init_executor(self) -> None: # KV connector setup self.has_connector = self.vllm_config.kv_transfer_config is not None + self.ec_producer = ( + self.vllm_config.ec_transfer_config is not None + and self.vllm_config.ec_transfer_config.is_ec_producer + ) + self.scheduler_output: SchedulerOutput | None = None @property @@ -395,6 +400,12 @@ def execute_model( # type: ignore[override] "State error: sample_tokens() must be called " "after execute_model() returns None." ) + + if self.ec_producer or not scheduler_output.total_num_scheduled_tokens: + # Model will not execute, call model runner immediately. + return self._execute_dag(scheduler_output, None, non_block) + + # Model will execute, defer to sample_tokens() call. 
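# Illustrative sketch (not part of the patch): when execute_model() defers the
# real work to a later sample_tokens() call, the executor still has to hand a
# Future back to the caller, so it returns one that is already resolved to
# None. COMPLETED_NONE_FUTURE used in this file is assumed to be built in
# essentially this way.
from concurrent.futures import Future


def make_completed_none_future():
    fut = Future()
    fut.set_result(None)  # already done; .result() returns None immediately
    return fut


deferred = make_completed_none_future()
assert deferred.done() and deferred.result() is None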
self.scheduler_output = scheduler_output return COMPLETED_NONE_FUTURE if non_block else None @@ -417,10 +428,18 @@ def sample_tokens( # type: ignore[override] """ scheduler_output = self.scheduler_output if scheduler_output is None: - return None # noqa + return COMPLETED_NONE_FUTURE if non_block else None # noqa self.scheduler_output = None + return self._execute_dag(scheduler_output, grammar_output, non_block) + + def _execute_dag( + self, + scheduler_output: SchedulerOutput, + grammar_output: "GrammarOutput | None", + non_block: bool = False, + ) -> ModelRunnerOutput | Future[ModelRunnerOutput]: # Build the compiled DAG for the first time. if self.forward_dag is None: # type: ignore self.forward_dag = self._compiled_ray_dag(enable_asyncio=False) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 758e3e1b3a82..2a8ff746f112 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -7,7 +7,7 @@ from collections import defaultdict from collections.abc import Iterator from contextlib import contextmanager -from copy import deepcopy +from copy import copy, deepcopy from functools import reduce from itertools import product from typing import TYPE_CHECKING, Any, NamedTuple, TypeAlias, cast @@ -250,7 +250,6 @@ class ExecuteModelState(NamedTuple): hidden_states: torch.Tensor sample_hidden_states: torch.Tensor aux_hidden_states: list[torch.Tensor] | None - kv_connector_output: KVConnectorOutput | None ec_connector_output: ECConnectorOutput | None @@ -573,6 +572,7 @@ def __init__( # Ephemeral state transferred between execute_model() and sample_tokens(). self.execute_model_state: ExecuteModelState | None = None + self.kv_connector_output: KVConnectorOutput | None = None def reset_mm_cache(self) -> None: if self.mm_budget: @@ -2803,6 +2803,7 @@ def execute_model( # Return the intermediate tensors. assert isinstance(hidden_states, IntermediateTensors) hidden_states.kv_connector_output = kv_connector_output + self.kv_connector_output = kv_connector_output return hidden_states if self.is_pooling_model: @@ -2853,18 +2854,31 @@ def execute_model( hidden_states, sample_hidden_states, aux_hidden_states, - kv_connector_output, ec_connector_output, ) + self.kv_connector_output = kv_connector_output return None @torch.inference_mode def sample_tokens( self, grammar_output: "GrammarOutput | None" ) -> ModelRunnerOutput | AsyncModelRunnerOutput | IntermediateTensors: + kv_connector_output = self.kv_connector_output + self.kv_connector_output = None + if self.execute_model_state is None: # Nothing to do (PP non-final rank case), output isn't used. - return None # noqa + if not kv_connector_output: + return None # noqa + + # In case of PP with kv transfer, we need to pass through the + # kv_connector_output + if kv_connector_output.is_empty(): + return EMPTY_MODEL_RUNNER_OUTPUT + + output = copy(EMPTY_MODEL_RUNNER_OUTPUT) + output.kv_connector_output = kv_connector_output + return output # Unpack ephemeral state. ( @@ -2875,7 +2889,6 @@ def sample_tokens( hidden_states, sample_hidden_states, aux_hidden_states, - kv_connector_output, ec_connector_output, ) = self.execute_model_state # Clear ephemeral state. 
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 42a844d96558..315f01b68499 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A GPU worker class.""" -import copy import gc import os from contextlib import AbstractContextManager, nullcontext @@ -45,7 +44,6 @@ from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec from vllm.v1.outputs import ( - EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput, DraftTokenIds, ModelRunnerOutput, @@ -581,18 +579,7 @@ def execute_model( all_gather_tensors=all_gather_tensors, ) - kv_connector_output = output.kv_connector_output - if not kv_connector_output: - return None - - # In case of PP with kv transfer, we need to pass through the - # kv_connector_output - if kv_connector_output.is_empty(): - return EMPTY_MODEL_RUNNER_OUTPUT - - output = copy.copy(EMPTY_MODEL_RUNNER_OUTPUT) - output.kv_connector_output = kv_connector_output - return output + return None def take_draft_token_ids(self) -> DraftTokenIds | None: return self.model_runner.take_draft_token_ids() From 95ae50b7d1bf3b5b66ac39b19d3169bad5443f2e Mon Sep 17 00:00:00 2001 From: Shreyas Kulkarni Date: Mon, 17 Nov 2025 18:01:34 -0500 Subject: [PATCH 135/578] [Quantization] [Eagle] Add complete quantization support to the draft model in Eagle (#28435) Signed-off-by: Shreyas Kulkarni --- .../model_executor/test_eagle_quantization.py | 169 ++++++++++++++++++ vllm/model_executor/models/llama_eagle.py | 53 ++++-- vllm/model_executor/models/llama_eagle3.py | 62 +++++-- vllm/model_executor/models/utils.py | 27 +++ 4 files changed, 282 insertions(+), 29 deletions(-) create mode 100644 tests/model_executor/test_eagle_quantization.py diff --git a/tests/model_executor/test_eagle_quantization.py b/tests/model_executor/test_eagle_quantization.py new file mode 100644 index 000000000000..1ab75933ee31 --- /dev/null +++ b/tests/model_executor/test_eagle_quantization.py @@ -0,0 +1,169 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from unittest.mock import Mock, patch + +import pytest +import torch + +from vllm.config import LoadConfig, ModelConfig, SpeculativeConfig, VllmConfig +from vllm.model_executor.models.utils import get_draft_quant_config +from vllm.platforms import current_platform + +DEVICES = ( + [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] + if current_platform.is_cuda_alike() + else ["cpu"] +) + + +def test_get_draft_quant_config_with_draft_model(): + mock_draft_model_config = Mock(spec=ModelConfig) + mock_load_config = Mock(spec=LoadConfig) + mock_speculative_config = Mock(spec=SpeculativeConfig) + mock_speculative_config.draft_model_config = mock_draft_model_config + + mock_vllm_config = Mock(spec=VllmConfig) + mock_vllm_config.speculative_config = mock_speculative_config + mock_vllm_config.load_config = mock_load_config + + mock_quant_config = Mock() + with patch.object( + VllmConfig, "get_quantization_config", return_value=mock_quant_config + ): + result = get_draft_quant_config(mock_vllm_config) + + # Verify the function calls get_quantization_config with draft model config + VllmConfig.get_quantization_config.assert_called_once_with( + mock_draft_model_config, mock_load_config + ) + assert result == mock_quant_config + + +def test_get_draft_quant_config_without_draft_model(): + 
mock_speculative_config = Mock(spec=SpeculativeConfig) + mock_speculative_config.draft_model_config = None + + mock_vllm_config = Mock(spec=VllmConfig) + mock_vllm_config.speculative_config = mock_speculative_config + mock_vllm_config.load_config = Mock(spec=LoadConfig) + + result = get_draft_quant_config(mock_vllm_config) + + assert result is None + + +@torch.inference_mode() +@pytest.mark.parametrize("device", DEVICES) +def test_fc_layer_quant_config_usage(dist_init, device) -> None: + import torch + + from vllm.model_executor.layers.linear import ReplicatedLinear + + if current_platform.is_cuda_alike(): + torch.cuda.set_device(device) + + torch.set_default_device(device) + + input_size = 256 + output_size = 128 + + fc_no_quant = ReplicatedLinear( + input_size=input_size, + output_size=output_size, + bias=False, + params_dtype=torch.float16, + quant_config=None, + prefix="fc", + ) + + assert fc_no_quant.quant_config is None + assert fc_no_quant.input_size == input_size + assert fc_no_quant.output_size == output_size + + mock_quant_config = Mock() + fc_with_quant = ReplicatedLinear( + input_size=input_size, + output_size=output_size, + bias=False, + params_dtype=torch.float16, + quant_config=mock_quant_config, + prefix="fc", + ) + + assert fc_with_quant.quant_config == mock_quant_config + + # Check forward pass + x = torch.randn(2, input_size, dtype=torch.float16) + output, _ = fc_no_quant(x) + assert output.shape == (2, output_size) + + +def test_kv_cache_scale_name_handling(): + # Mock a quant config that supports cache scales + mock_quant_config = Mock() + mock_quant_config.get_cache_scale = Mock(return_value="layers.0.self_attn.kv_scale") + + # Condition check in load_weights + name = "layers.0.self_attn.k_proj.weight" + scale_name = mock_quant_config.get_cache_scale(name) + + # Check if get_cache_scale is called and returns expected value + mock_quant_config.get_cache_scale.assert_called_once_with(name) + assert scale_name == "layers.0.self_attn.kv_scale" + + +def test_kv_cache_scale_name_no_scale(): + # Mock a quant config that returns None for get_cache_scale + mock_quant_config = Mock() + mock_quant_config.get_cache_scale = Mock(return_value=None) + + name = "layers.0.mlp.gate_proj.weight" + scale_name = mock_quant_config.get_cache_scale(name) + + # Should return None for weights that don't have cache scales + assert scale_name is None + + +def test_maybe_remap_kv_scale_name(): + from vllm.model_executor.model_loader.weight_utils import maybe_remap_kv_scale_name + + params_dict = { + "layers.0.self_attn.kv_scale": Mock(), + "layers.1.self_attn.kv_scale": Mock(), + } + + name = "layers.0.self_attn.some_scale" + remapped = maybe_remap_kv_scale_name(name, params_dict) + + assert remapped in params_dict or remapped == name or remapped is None + + +def test_load_weights_kv_scale_handling(): + kv_scale_param = Mock() + kv_scale_param.weight_loader = Mock() + + params_dict = { + "layers.0.self_attn.kv_scale": kv_scale_param, + } + + mock_quant_config = Mock() + mock_quant_config.get_cache_scale = Mock(return_value="layers.0.self_attn.kv_scale") + + # Load_weights logic for KV cache scales + name = "layers.0.self_attn.k_proj.weight" + loaded_weight_tensor = torch.tensor([1.0, 2.0]) + + if mock_quant_config is not None: + scale_name = mock_quant_config.get_cache_scale(name) + if scale_name: + param = params_dict[scale_name] + assert param is kv_scale_param + weight_to_load = ( + loaded_weight_tensor + if loaded_weight_tensor.dim() == 0 + else loaded_weight_tensor[0] + ) + + assert 
scale_name == "layers.0.self_attn.kv_scale" + assert weight_to_load == loaded_weight_tensor[0] diff --git a/vllm/model_executor/models/llama_eagle.py b/vllm/model_executor/models/llama_eagle.py index 0287132c5637..90ab5c50361b 100644 --- a/vllm/model_executor/models/llama_eagle.py +++ b/vllm/model_executor/models/llama_eagle.py @@ -11,13 +11,22 @@ from vllm.config import VllmConfig from vllm.distributed.parallel_state import get_pp_group from vllm.logger import init_logger +from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding -from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, + maybe_remap_kv_scale_name, +) from vllm.model_executor.models.llama import LlamaDecoderLayer, LlamaForCausalLM -from .utils import AutoWeightsLoader, maybe_prefix, process_eagle_weight +from .utils import ( + AutoWeightsLoader, + get_draft_quant_config, + maybe_prefix, + process_eagle_weight, +) logger = init_logger(__name__) @@ -40,14 +49,7 @@ def __init__( def get_quant_config(self, vllm_config: VllmConfig) -> QuantizationConfig | None: """Use drafter's quantization config instead of verifier's.""" - draft_model_config = vllm_config.speculative_config.draft_model_config - draft_load_config = vllm_config.load_config - - return ( - VllmConfig.get_quantization_config(draft_model_config, draft_load_config) - if draft_model_config - else None - ) + return get_draft_quant_config(vllm_config) @support_torch_compile @@ -63,6 +65,9 @@ def __init__( self.config = vllm_config.speculative_config.draft_model_config.hf_config self.vocab_size = self.config.vocab_size + # Get drafter's quantization config + self.quant_config = get_draft_quant_config(vllm_config) + self.embed_tokens = VocabParallelEmbedding( self.config.vocab_size, self.config.hidden_size, @@ -80,8 +85,14 @@ def __init__( for i in range(self.config.num_hidden_layers) ] ) - self.fc = torch.nn.Linear( - self.config.hidden_size * 2, self.config.hidden_size, bias=False + self.fc = ReplicatedLinear( + input_size=self.config.hidden_size * 2, + output_size=self.config.hidden_size, + bias=False, + params_dtype=vllm_config.model_config.dtype, + quant_config=self.quant_config, + prefix=maybe_prefix(prefix, "fc"), + return_bias=False, ) def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: @@ -117,6 +128,24 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) loaded_params: set[str] = set() for name, loaded_weight in weights: + # Handle kv cache quantization scales + if self.quant_config is not None and ( + scale_name := self.quant_config.get_cache_scale(name) + ): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + loaded_weight = ( + loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] + ) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue + # Remapping the name FP8 kv-scale + if "scale" in name: + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue diff 
--git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py index a3bcc5eeb32b..75c671311b49 100644 --- a/vllm/model_executor/models/llama_eagle3.py +++ b/vllm/model_executor/models/llama_eagle3.py @@ -11,19 +11,27 @@ from vllm.config import VllmConfig, get_current_vllm_config from vllm.logger import init_logger from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import QKVParallelLinear +from vllm.model_executor.layers.linear import QKVParallelLinear, ReplicatedLinear from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding, ) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, + maybe_remap_kv_scale_name, +) from vllm.model_executor.models.llama import LlamaDecoderLayer, LlamaForCausalLM from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import NestedTensors -from .utils import AutoWeightsLoader, maybe_prefix, process_eagle_weight +from .utils import ( + AutoWeightsLoader, + get_draft_quant_config, + maybe_prefix, + process_eagle_weight, +) logger = init_logger(__name__) @@ -66,14 +74,7 @@ def __init__( def get_quant_config(self, vllm_config: VllmConfig) -> QuantizationConfig | None: """Use drafter's quantization config instead of verifier's.""" - draft_model_config = vllm_config.speculative_config.draft_model_config - draft_load_config = vllm_config.load_config - - return ( - VllmConfig.get_quantization_config(draft_model_config, draft_load_config) - if draft_model_config - else None - ) + return get_draft_quant_config(vllm_config) def _norm_before_residual( self, hidden_states: torch.Tensor @@ -140,6 +141,9 @@ def __init__( self.config = vllm_config.speculative_config.draft_model_config.hf_config self.vocab_size = self.config.vocab_size + # Get drafter's quantization config + self.quant_config = get_draft_quant_config(vllm_config) + current_vllm_config = get_current_vllm_config() self.embed_tokens = VocabParallelEmbedding( @@ -160,13 +164,19 @@ def __init__( ] ) if hasattr(self.config, "target_hidden_size"): - self.fc = torch.nn.Linear( - self.config.target_hidden_size * 3, self.config.hidden_size, bias=False - ) + fc_input_size = self.config.target_hidden_size * 3 else: - self.fc = torch.nn.Linear( - self.config.hidden_size * 3, self.config.hidden_size, bias=False - ) + fc_input_size = self.config.hidden_size * 3 + self.fc = ReplicatedLinear( + input_size=fc_input_size, + output_size=self.config.hidden_size, + bias=False, + params_dtype=vllm_config.model_config.dtype, + quant_config=self.quant_config, + prefix=maybe_prefix(prefix, "fc"), + return_bias=False, + ) + self.norm = RMSNorm( self.config.hidden_size, eps=self.config.rms_norm_eps, @@ -211,6 +221,24 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: for name, loaded_weight in weights: if "midlayer." 
in name: name = name.replace("midlayer.", "layers.0.") + # Handle kv cache quantization scales + if self.quant_config is not None and ( + scale_name := self.quant_config.get_cache_scale(name) + ): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + loaded_weight = ( + loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] + ) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue + # Remapping the name FP8 kv-scale + if "scale" in name: + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 0d811fbc7585..ca5af358e2ee 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -18,6 +18,9 @@ get_tensor_model_parallel_world_size, ) from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, +) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.interfaces import supports_any_eagle from vllm.multimodal import NestedTensors @@ -715,6 +718,30 @@ def maybe_prefix(prefix: str, name: str) -> str: return name if not prefix else f"{prefix}.{name}" +def get_draft_quant_config( + vllm_config: VllmConfig, +) -> QuantizationConfig | None: + """Get quantization config for Draft models. + + Draft models should use their own quantization config instead of the verifier/target + model's config. This helper retrieves the draft model's quantization config. + + Args: + vllm_config: The vLLM configuration object. + + Returns: + The draft model's config if available, None otherwise. + """ + draft_model_config = vllm_config.speculative_config.draft_model_config + draft_load_config = vllm_config.load_config + + return ( + VllmConfig.get_quantization_config(draft_model_config, draft_load_config) + if draft_model_config + else None + ) + + def extract_layer_index(layer_name: str, num_attn_module: int = 1) -> int: """ Extract the layer index from the module name. 
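As a usage sketch of the helper introduced above (condensed from the llama_eagle hunks, assuming a fully populated vllm_config whose speculative_config carries a draft_model_config; build_drafter_fc is a hypothetical name, not a vLLM API): the drafter resolves its own quantization config once and threads it into its quantized layers instead of inheriting the verifier's config.

from vllm.model_executor.layers.linear import ReplicatedLinear
from vllm.model_executor.models.utils import get_draft_quant_config, maybe_prefix


def build_drafter_fc(vllm_config, hidden_size: int, prefix: str = ""):
    # None when the draft model is unquantized; ReplicatedLinear then falls
    # back to its unquantized path.
    draft_quant_config = get_draft_quant_config(vllm_config)
    return ReplicatedLinear(
        input_size=hidden_size * 2,  # EAGLE concatenates embedding and hidden states
        output_size=hidden_size,
        bias=False,
        params_dtype=vllm_config.model_config.dtype,
        quant_config=draft_quant_config,
        prefix=maybe_prefix(prefix, "fc"),
        return_bias=False,
    )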
From a289cc1dde4a1aeee05492bbe4cc39a18f070135 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Mon, 17 Nov 2025 18:09:47 -0500 Subject: [PATCH 136/578] [Test] Batch Invariant: Rename and organize tests (#27421) Signed-off-by: yewentao256 --- tests/v1/determinism/conftest.py | 11 ++ .../test_batch_invariance.py | 75 +------- .../test_online_batch_invariance.py | 161 ++++++++++++++++++ .../test_rms_norm_batch_invariant.py | 7 +- tests/v1/determinism/utils.py | 74 ++++++++ 5 files changed, 248 insertions(+), 80 deletions(-) create mode 100644 tests/v1/determinism/conftest.py rename tests/v1/{generation => determinism}/test_batch_invariance.py (92%) create mode 100644 tests/v1/determinism/test_online_batch_invariance.py rename tests/v1/{generation => determinism}/test_rms_norm_batch_invariant.py (97%) create mode 100644 tests/v1/determinism/utils.py diff --git a/tests/v1/determinism/conftest.py b/tests/v1/determinism/conftest.py new file mode 100644 index 000000000000..3c2136e00584 --- /dev/null +++ b/tests/v1/determinism/conftest.py @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest + + +@pytest.fixture(autouse=True) +def enable_batch_invariant_mode(monkeypatch: pytest.MonkeyPatch): + """Automatically enable batch invariant kernel overrides for all tests.""" + monkeypatch.setenv("VLLM_BATCH_INVARIANT", "1") + yield diff --git a/tests/v1/generation/test_batch_invariance.py b/tests/v1/determinism/test_batch_invariance.py similarity index 92% rename from tests/v1/generation/test_batch_invariance.py rename to tests/v1/determinism/test_batch_invariance.py index 8fd038bca5d0..f018ee551dbf 100644 --- a/tests/v1/generation/test_batch_invariance.py +++ b/tests/v1/determinism/test_batch_invariance.py @@ -6,66 +6,9 @@ import pytest import torch +from utils import _extract_step_logprobs, _random_prompt, skip_unsupported from vllm import LLM, SamplingParams -from vllm.platforms import current_platform - -skip_unsupported = pytest.mark.skipif( - not (current_platform.is_cuda() and current_platform.has_device_capability(90)), - reason="Requires CUDA and >= Hopper (SM90)", -) - - -@pytest.fixture(autouse=True) -def enable_batch_invariant_mode(monkeypatch: pytest.MonkeyPatch): - """Automatically enable batch invariant kernel overrides for all tests.""" - monkeypatch.setenv("VLLM_BATCH_INVARIANT", "1") - yield - - -def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str: - # Generate more realistic prompts that will actually produce varied tokens - # Use a mix of common English text patterns - - prompt_templates = [ - # Question-answer style - "Question: What is the capital of France?\nAnswer: The capital of France is", - "Q: How does photosynthesis work?\nA: Photosynthesis is the process by which", - "User: Can you explain quantum mechanics?\nAssistant: Quantum mechanics is", - # Story/narrative style - "Once upon a time in a distant galaxy, there lived", - "The old man walked slowly down the street, remembering", - "In the year 2157, humanity finally discovered", - # Technical/code style - "To implement a binary search tree in Python, first we need to", - "The algorithm works by iterating through the array and", - "Here's how to optimize database queries using indexing:", - # Factual/informative style - "The Renaissance was a period in European history that", - "Climate change is caused by several factors including", - "The human brain contains 
approximately 86 billion neurons which", - # Conversational style - "I've been thinking about getting a new laptop because", - "Yesterday I went to the store and bought", - "My favorite thing about summer is definitely", - ] - - # Pick a random template - base_prompt = random.choice(prompt_templates) - - if max_words < min_words: - max_words = min_words - target_words = random.randint(min_words, max_words) - - if target_words > 50: - # For longer prompts, repeat context - padding_text = ( - " This is an interesting topic that deserves more explanation. " - * (target_words // 50) - ) - base_prompt = base_prompt + padding_text - - return base_prompt @skip_unsupported @@ -204,22 +147,6 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle( llm_bsN.shutdown() -def _extract_step_logprobs(request_output): - if getattr(request_output, "outputs", None): - inner = request_output.outputs[0] - if hasattr(inner, "logprobs") and inner.logprobs is not None: - t = torch.tensor( - [ - inner.logprobs[i][tid].logprob - for i, tid in enumerate(inner.token_ids) - ], - dtype=torch.float32, - ) - return t, inner.token_ids - - return None, None - - @skip_unsupported @pytest.mark.parametrize( "backend", diff --git a/tests/v1/determinism/test_online_batch_invariance.py b/tests/v1/determinism/test_online_batch_invariance.py new file mode 100644 index 000000000000..23f47863dd23 --- /dev/null +++ b/tests/v1/determinism/test_online_batch_invariance.py @@ -0,0 +1,161 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +HTTP-based batch invariance test: send requests to a running +vLLM server and compare BS=1 vs BS=N results (tokens and per-step logprobs). + +Environment variables: + - VLLM_TEST_MODEL: served model name (e.g., Qwen/Qwen3-1.7B / DeepSeek-R1) + - VLLM_TP_SIZE: tensor parallelism size (e.g., 4) + +""" + +import os +import random +import sys +from typing import Any + +import openai +from utils import _random_prompt, skip_unsupported + +from tests.utils import RemoteOpenAIServer + + +def _request_completion( + client: openai.OpenAI, + model: str, + prompt: Any, + sp: dict[str, Any], + max_retries: int = 3, + retry_backoff: float = 0.5, +) -> dict[str, Any] | None: + payload: dict[str, Any] = {"model": model, "prompt": prompt} + payload.update(sp) + + for attempt in range(max_retries + 1): + try: + completion = client.completions.create(**payload) + # Convert to plain dict so downstream logic can keep using + # dict-style access just like with raw HTTP JSON. 
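# Note on the comparison performed later in _compare_bs1_vs_bsn_single_process
# (sketch, not part of the test): the assertion deliberately uses exact float
# equality, because batch invariance means BS=1 and BS=N must produce
# bit-identical logprobs; a tolerance would mask real nondeterminism.
# Ordinary floating-point arithmetic fails exact equality even for "close"
# values:
a = 0.1 + 0.2
b = 0.3
assert a != b              # 0.30000000000000004 vs 0.3
assert abs(a - b) < 1e-15  # close, but not bit-identical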
+ return completion.model_dump() + except Exception as e: # pragma: no cover + if attempt < max_retries: + import time as _t + + _t.sleep(retry_backoff * (2**attempt)) + continue + sys.stderr.write(f"Error: {e}\n") + return None + return None + + +def _extract_tokens_and_logprobs( + choice: dict[str, Any], +) -> tuple[list[Any], list[float] | None]: + tokens: list[Any] = [] + token_logprobs: list[float] | None = None + lp = choice.get("logprobs") + if lp and isinstance(lp, dict): + tokens = lp.get("token_ids") or lp.get("tokens") or [] + token_logprobs = lp.get("token_logprobs", None) + return tokens, token_logprobs + + +def _compare_bs1_vs_bsn_single_process( + prompts: list[str], + sp_kwargs: dict[str, Any], + client: openai.OpenAI, + model_name: str, +) -> None: + # BS=1 + bs1_tokens_per_prompt: list[list[Any]] = [] + bs1_logprobs_per_prompt: list[list[float] | None] = [] + for p in prompts: + resp = _request_completion(client, model_name, p, sp_kwargs) + if resp is None or not resp.get("choices"): + raise AssertionError("BS=1 empty/failed response") + choice = resp["choices"][0] + toks, lps = _extract_tokens_and_logprobs(choice) + if lps is None: + raise AssertionError( + "logprobs not returned; ensure server supports 'logprobs'" + ) + bs1_tokens_per_prompt.append(list(toks)) + bs1_logprobs_per_prompt.append(list(lps)) + + # BS=N + bsN_tokens_per_prompt: list[list[Any]] = [None] * len(prompts) # type: ignore[list-item] + bsN_logprobs_per_prompt: list[list[float] | None] = [None] * len(prompts) + resp = _request_completion(client, model_name, prompts, sp_kwargs) + if resp is None or not resp.get("choices"): + raise AssertionError("BS=N empty/failed batched response") + choices = resp.get("choices", []) + if len(choices) != len(prompts): + raise AssertionError( + f"BS=N choices length {len(choices)} != num prompts {len(prompts)}" + ) + for idx, choice in enumerate(choices): + toks, lps = _extract_tokens_and_logprobs(choice) + if lps is None: + raise AssertionError(f"BS=N missing logprobs for prompt {idx}") + bsN_tokens_per_prompt[idx] = list(toks) + bsN_logprobs_per_prompt[idx] = list(lps) + + # compare + for i, (tokens_bs1, tokens_bsN, logprobs_bs1, logprobs_bsN) in enumerate( + zip( + bs1_tokens_per_prompt, + bsN_tokens_per_prompt, + bs1_logprobs_per_prompt, + bsN_logprobs_per_prompt, + ) + ): + if tokens_bs1 != tokens_bsN: + raise AssertionError( + f"Prompt {i} (sampling): Different tokens sampled. " + f"BS=1 tokens: {tokens_bs1} BS=N tokens: {tokens_bsN}" + ) + if logprobs_bs1 is None or logprobs_bsN is None: + raise AssertionError(f"Prompt {i}: Missing logprobs in one of the runs") + if len(logprobs_bs1) != len(logprobs_bsN): + raise AssertionError( + f"Prompt {i}: Different number of steps: " + f"{len(logprobs_bs1)} (BS=1) vs {len(logprobs_bsN)} (BS=N)." + ) + for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bsN)): + if a != b: + diff = abs(a - b) + raise AssertionError( + f"Prompt {i} Step {t}: Bitwise mismatch " + f"(abs diff={diff:.6e}). 
" + f"BS=1 tokens: {tokens_bs1} BS=N tokens: {tokens_bsN}" + ) + + +@skip_unsupported +def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(): + random.seed(int(os.getenv("VLLM_TEST_SEED", "12345"))) + model_name = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B") + prompts_all = [_random_prompt(10, 50) for _ in range(32)] + + sp_kwargs: dict[str, Any] = { + "temperature": 0.6, + "top_p": 1.0, + "max_tokens": 8, + "seed": 42, + "logprobs": 5, + } + + tp_size = os.getenv("VLLM_TP_SIZE", "1") + server_args: list[str] = [] + if tp_size: + server_args += ["-tp", tp_size] + + with RemoteOpenAIServer(model_name, server_args) as server: + client = server.get_client() + _compare_bs1_vs_bsn_single_process( + prompts=prompts_all, + sp_kwargs=sp_kwargs, + client=client, + model_name=model_name, + ) diff --git a/tests/v1/generation/test_rms_norm_batch_invariant.py b/tests/v1/determinism/test_rms_norm_batch_invariant.py similarity index 97% rename from tests/v1/generation/test_rms_norm_batch_invariant.py rename to tests/v1/determinism/test_rms_norm_batch_invariant.py index f79eba58d6ef..390872519528 100644 --- a/tests/v1/generation/test_rms_norm_batch_invariant.py +++ b/tests/v1/determinism/test_rms_norm_batch_invariant.py @@ -9,15 +9,10 @@ import pytest import torch +from utils import skip_unsupported from vllm.model_executor.layers.batch_invariant import rms_norm as triton_rms_norm from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.platforms import current_platform - -skip_unsupported = pytest.mark.skipif( - not (current_platform.is_cuda() and current_platform.has_device_capability(90)), - reason="Requires CUDA and >= Hopper (SM90)", -) @skip_unsupported diff --git a/tests/v1/determinism/utils.py b/tests/v1/determinism/utils.py new file mode 100644 index 000000000000..5141837faea0 --- /dev/null +++ b/tests/v1/determinism/utils.py @@ -0,0 +1,74 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import random + +import pytest +import torch + +from vllm.platforms import current_platform + +skip_unsupported = pytest.mark.skipif( + not (current_platform.is_cuda() and current_platform.has_device_capability(90)), + reason="Requires CUDA and >= Hopper (SM90)", +) + + +def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str: + # Generate more realistic prompts that will actually produce varied tokens + # Use a mix of common English text patterns + + prompt_templates = [ + # Question-answer style + "Question: What is the capital of France?\nAnswer: The capital of France is", + "Q: How does photosynthesis work?\nA: Photosynthesis is the process by which", + "User: Can you explain quantum mechanics?\nAssistant: Quantum mechanics is", + # Story/narrative style + "Once upon a time in a distant galaxy, there lived", + "The old man walked slowly down the street, remembering", + "In the year 2157, humanity finally discovered", + # Technical/code style + "To implement a binary search tree in Python, first we need to", + "The algorithm works by iterating through the array and", + "Here's how to optimize database queries using indexing:", + # Factual/informative style + "The Renaissance was a period in European history that", + "Climate change is caused by several factors including", + "The human brain contains approximately 86 billion neurons which", + # Conversational style + "I've been thinking about getting a new laptop because", + "Yesterday I went to the store and bought", + "My favorite thing about summer is 
definitely", + ] + + # Pick a random template + base_prompt = random.choice(prompt_templates) + + if max_words < min_words: + max_words = min_words + target_words = random.randint(min_words, max_words) + + if target_words > 50: + # For longer prompts, repeat context + padding_text = ( + " This is an interesting topic that deserves more explanation. " + * (target_words // 50) + ) + base_prompt = base_prompt + padding_text + + return base_prompt + + +def _extract_step_logprobs(request_output): + if getattr(request_output, "outputs", None): + inner = request_output.outputs[0] + if hasattr(inner, "logprobs") and inner.logprobs is not None: + t = torch.tensor( + [ + inner.logprobs[i][tid].logprob + for i, tid in enumerate(inner.token_ids) + ], + dtype=torch.float32, + ) + return t, inner.token_ids + + return None, None From f77bce001a6261da0661f0192c8cddd1ca453750 Mon Sep 17 00:00:00 2001 From: Pranav <56645758+pranav4501@users.noreply.github.com> Date: Mon, 17 Nov 2025 15:11:20 -0800 Subject: [PATCH 137/578] [Model] Add Afmoe architecture implementation (#28332) Signed-off-by: Maziyar Panahi Signed-off-by: Pranav Co-authored-by: Maziyar Panahi --- docs/models/supported_models.md | 1 + tests/models/registry.py | 4 + vllm/model_executor/models/afmoe.py | 711 ++++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + vllm/transformers_utils/config.py | 1 + vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/configs/afmoe.py | 84 +++ 7 files changed, 804 insertions(+) create mode 100644 vllm/model_executor/models/afmoe.py create mode 100644 vllm/transformers_utils/configs/afmoe.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index d47aeaab511b..bd14bbb9ab66 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -351,6 +351,7 @@ th { | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | |--------------|--------|-------------------|----------------------|---------------------------| +| `AfmoeForCausalLM` | Afmoe | TBA | ✅︎ | ✅︎ | | `ApertusForCausalLM` | Apertus | `swiss-ai/Apertus-8B-2509`, `swiss-ai/Apertus-70B-Instruct-2509`, etc. | ✅︎ | ✅︎ | | `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ | | `ArceeForCausalLM` | Arcee (AFM) | `arcee-ai/AFM-4.5B-Base`, etc. 
| ✅︎ | ✅︎ | diff --git a/tests/models/registry.py b/tests/models/registry.py index 644d0619215f..094f921e4305 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -173,6 +173,10 @@ def check_available_online( _TEXT_GENERATION_EXAMPLE_MODELS = { # [Decoder-only] + "AfmoeForCausalLM": _HfExamplesInfo( + "arcee-ai/Trinity-Nano", + is_available_online=False, + ), "ApertusForCausalLM": _HfExamplesInfo("swiss-ai/Apertus-8B-Instruct-2509"), "AquilaModel": _HfExamplesInfo("BAAI/AquilaChat-7B", trust_remote_code=True), "AquilaForCausalLM": _HfExamplesInfo("BAAI/AquilaChat2-7B", trust_remote_code=True), diff --git a/vllm/model_executor/models/afmoe.py b/vllm/model_executor/models/afmoe.py new file mode 100644 index 000000000000..6f654f47495f --- /dev/null +++ b/vllm/model_executor/models/afmoe.py @@ -0,0 +1,711 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Inference-only AfMoE model compatible with HuggingFace weights.""" + +import typing +from collections.abc import Callable, Iterable +from itertools import islice +from typing import Any + +import torch +from torch import nn + +from vllm.attention import Attention, AttentionType +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config +from vllm.distributed import ( + get_ep_group, + get_pp_group, + get_tensor_model_parallel_world_size, +) +from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import ( + ColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear, +) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, + VocabParallelEmbedding, +) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, + maybe_remap_kv_scale_name, +) +from vllm.model_executor.models.interfaces import SupportsLoRA, SupportsPP +from vllm.model_executor.models.llama import LlamaMLP as AfmoeMLP +from vllm.model_executor.models.utils import ( + AutoWeightsLoader, + PPMissingLayer, + WeightsMapper, + extract_layer_index, + is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, + make_layers, + maybe_prefix, +) +from vllm.sequence import IntermediateTensors + +logger = init_logger(__name__) + + +class AfmoeMoE(nn.Module): + def __init__( + self, + config, # AfmoeConfig + quant_config: QuantizationConfig | None = None, + prefix: str = "", + enable_eplb: bool = False, + ): + super().__init__() + self.tp_size = get_tensor_model_parallel_world_size() + self.route_scale = config.route_scale + self.score_func = config.score_func + self.route_norm = config.route_norm + + self.ep_group = get_ep_group().device_group + self.ep_rank = self.ep_group.rank() + self.ep_size = self.ep_group.size() + self.n_routed_experts: int = config.num_experts + self.n_shared_experts: int = config.num_shared_experts + + if config.hidden_act != "silu": + raise ValueError( + f"Unsupported activation: {config.hidden_act}. " + "Only silu is supported for now." 
+ ) + + # Router gate + self.gate = nn.Linear( + config.hidden_size, + config.num_experts, + bias=False, + dtype=torch.float32, + ) + self.expert_bias = nn.Parameter( + torch.empty(config.num_experts, dtype=torch.float32) + ) + + # Load balancing settings + vllm_config = get_current_vllm_config() + eplb_config = vllm_config.parallel_config.eplb_config + self.enable_eplb = enable_eplb + + self.n_redundant_experts = eplb_config.num_redundant_experts + self.n_logical_experts = self.n_routed_experts + self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts + self.n_local_physical_experts = self.n_physical_experts // self.ep_size + + self.physical_expert_start = self.ep_rank * self.n_local_physical_experts + self.physical_expert_end = ( + self.physical_expert_start + self.n_local_physical_experts + ) + + self.shared_experts = None + # Shared experts + if config.num_shared_experts > 0: + intermediate_size = config.moe_intermediate_size * config.num_shared_experts + self.shared_experts = AfmoeMLP( + hidden_size=config.hidden_size, + intermediate_size=intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + reduce_results=False, + prefix=f"{prefix}.shared_experts", + ) + + # Routed experts using SharedFusedMoE + self.experts = SharedFusedMoE( + shared_experts=self.shared_experts, + num_experts=config.num_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size, + reduce_results=False, + renormalize=self.route_norm if self.score_func == "sigmoid" else False, + quant_config=quant_config, + use_grouped_topk=True, + num_expert_group=config.n_group, + topk_group=config.topk_group, + prefix=f"{prefix}.experts", + scoring_func=self.score_func, + routed_scaling_factor=self.route_scale, + e_score_correction_bias=self.expert_bias, + enable_eplb=self.enable_eplb, + num_redundant_experts=self.n_redundant_experts, + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + num_tokens, hidden_dim = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_dim) + + router_logits = self.gate(hidden_states.to(dtype=torch.float32)) + + fused_moe_out = self.experts( + hidden_states=hidden_states, router_logits=router_logits + ) + + if self.shared_experts is not None: + shared_output, final_hidden_states = fused_moe_out + final_hidden_states = final_hidden_states + shared_output + else: + final_hidden_states = fused_moe_out + if self.tp_size > 1: + final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( + final_hidden_states + ) + + return final_hidden_states.view(num_tokens, hidden_dim) + + +class AfmoeAttention(nn.Module): + def __init__( + self, + config, # AfmoeConfig + layer_idx: int, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rope_theta: float = 10000, + rope_scaling: dict[str, Any] | None = None, + max_position_embeddings: int = 131072, + head_dim: int | None = None, + rms_norm_eps: float = 1e-05, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + attn_type: str = AttentionType.DECODER, + ) -> None: + super().__init__() + self.layer_idx = layer_idx + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater 
than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = head_dim or (hidden_size // self.total_num_heads) + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + # Check if this is a local attention layer + self.is_local_attention = config.layer_types[layer_idx] == "sliding_attention" + self.sliding_window = config.sliding_window if self.is_local_attention else None + + self.qkv_proj = QKVParallelLinear( + self.hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + self.hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + + # Gating projection + self.gate_proj = ColumnParallelLinear( + hidden_size, + self.total_num_heads * self.head_dim, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.gate_proj", + ) + + # Q/K normalization + self.q_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps) + self.k_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps) + + # Only create rotary embeddings for local attention + if self.is_local_attention: + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + is_neox_style=True, + ) + else: + self.rotary_emb = None + + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + per_layer_sliding_window=self.sliding_window, + prefix=f"{prefix}.attn", + attn_type=attn_type, + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + gate, _ = self.gate_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + + # Apply Q/K normalization + q = self.q_norm(q.reshape(-1, self.num_heads, self.head_dim)).reshape(q.shape) + k = self.k_norm(k.reshape(-1, self.num_kv_heads, self.head_dim)).reshape( + k.shape + ) + + # Apply rotary embeddings only for local attention + if self.is_local_attention and self.rotary_emb is not None: + q, k = self.rotary_emb(positions, q, k) + + attn_output = self.attn(q, k, v) + + # Apply gating + attn_output = attn_output * torch.sigmoid(gate) + output, _ = self.o_proj(attn_output) + return output + + +class AfmoeDecoderLayer(nn.Module): + def __init__( + self, + config, # AfmoeConfig + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + enable_eplb: bool = False, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + if rope_scaling is not None and getattr( + config, "original_max_position_embeddings", None + ): + rope_scaling["original_max_position_embeddings"] = ( + 
config.original_max_position_embeddings + ) + max_position_embeddings = getattr(config, "max_position_embeddings", 131072) + + # DecoderLayers are created with `make_layers` which passes the prefix + # with the layer's index. + self.layer_idx = extract_layer_index(prefix) + + self.self_attn = AfmoeAttention( + config=config, + layer_idx=self.layer_idx, + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + rope_theta=rope_theta, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + head_dim=config.head_dim, + rms_norm_eps=config.rms_norm_eps, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) + + # MoE or dense FFN + self.moe_enabled = self.layer_idx >= config.num_dense_layers + if self.moe_enabled: + self.mlp = AfmoeMoE( + config=config, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + enable_eplb=enable_eplb, + ) + else: + self.mlp = AfmoeMLP( + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + + self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + self.pre_mlp_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_mlp_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: torch.Tensor | None, + ) -> tuple[torch.Tensor, torch.Tensor]: + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm(hidden_states, residual) + + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + ) + hidden_states = self.post_attention_layernorm(hidden_states) # attn norm b + + # Fully Connected + hidden_states, residual = self.pre_mlp_layernorm( # ffn norm a + hidden_states, residual + ) + hidden_states = self.mlp(hidden_states) + hidden_states = self.post_mlp_layernorm(hidden_states) # ffn norm b + + return hidden_states, residual + + +@support_torch_compile( + dynamic_arg_dims={ + "input_ids": 0, + "positions": -1, + "intermediate_tensors": 0, + "inputs_embeds": 0, + } +) +class AfmoeModel(nn.Module): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + enable_eplb = vllm_config.parallel_config.enable_eplb + self.config = config + + self.vocab_size = config.vocab_size + self.mup_enabled = config.mup_enabled + + if get_pp_group().is_first_rank: + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, config.hidden_size, prefix=f"{prefix}.embed_tokens" + ) + else: + self.embed_tokens = PPMissingLayer() + + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: AfmoeDecoderLayer( + config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix, + enable_eplb=enable_eplb, + ), + prefix=f"{prefix}.layers", + ) + + if get_pp_group().is_last_rank: + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + else: + self.norm = PPMissingLayer() + + self.make_empty_intermediate_tensors = 
make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size + ) + + def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + ) -> torch.Tensor | IntermediateTensors: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.embed_input_ids(input_ids) + + # Apply muP input scaling if enabled + if self.mup_enabled: + hidden_states = hidden_states * (self.config.hidden_size**0.5) + + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + for layer in islice(self.layers, self.start_layer, self.end_layer): + hidden_states, residual = layer(positions, hidden_states, residual) + + if not get_pp_group().is_last_rank: + return IntermediateTensors( + {"hidden_states": hidden_states, "residual": residual} + ) + + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + def make_empty_intermediate_tensors( + self, batch_size: int, dtype: torch.dtype, device: torch.device + ) -> IntermediateTensors: + return IntermediateTensors( + { + "hidden_states": torch.zeros( + (batch_size, self.config.hidden_size), dtype=dtype, device=device + ), + "residual": torch.zeros( + (batch_size, self.config.hidden_size), dtype=dtype, device=device + ), + } + ) + + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + return SharedFusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="gate_proj", + ckpt_down_proj_name="down_proj", + ckpt_up_proj_name="up_proj", + num_experts=self.config.num_experts, + ) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + expert_params_mapping = self.get_expert_mapping() + + for name, loaded_weight in weights: + for param_name, weight_name, shard_id in stacked_params_mapping: + # Skip non-stacked layers and experts (experts handled below). + if (weight_name not in name) or ("self_attn.gate_proj" in name): + continue + # We have mlp.experts[0].gate_proj in the checkpoint. + # Since we handle the experts below in expert_params_mapping, + # we need to skip here BEFORE we update the name, otherwise + # name will be updated to mlp.experts[0].gate_up_proj, which + # will then be updated below in expert_params_mapping + # for mlp.experts[0].gate_gate_up_proj, which breaks load. + if ("mlp.experts." in name) and name not in params_dict: + continue + + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. 
+ if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + is_expert_weight = False + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + if weight_name not in name: + continue + + # Anyway, this is an expert weight and should not be + # attempted to load as other weights later + is_expert_weight = True + + # Do not modify `name` since the loop may continue here + # Instead, create a new variable + name_mapped = name.replace(weight_name, param_name) + + if is_pp_missing_parameter(name_mapped, self): + continue + + param = params_dict[name_mapped] + # We should ask the weight loader to return success or not + # here since otherwise we may skip experts with other + # available replicas. + weight_loader = typing.cast( + Callable[..., bool], param.weight_loader + ) + success = weight_loader( + param, + loaded_weight, + name_mapped, + shard_id=shard_id, + expert_id=expert_id, + return_success=True, + ) + if success: + name = name_mapped + break + else: + if is_expert_weight: + # We've checked that this is an expert weight + # However it's not mapped locally to this rank + # So we simply skip it + continue + + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = getattr( + param, "weight_loader", default_weight_loader + ) + weight_loader(param, loaded_weight) + loaded_params.add(name) + + return loaded_params + + +class AfmoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_suffix={ + ".router.gate.weight": ".gate.weight", + }, + ) + + fall_back_to_pt_during_load = False + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.config = config + self.quant_config = quant_config + self.model = AfmoeModel( + vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") + ) + if get_pp_group().is_last_rank: + self.lm_head = ParallelLMHead( + config.vocab_size, config.hidden_size, quant_config=quant_config + ) + else: + self.lm_head = PPMissingLayer() + self.logits_processor = LogitsProcessor(config.vocab_size) + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors + ) + self.expert_weights = [] + + # Set MoE hyperparameters + self.num_moe_layers = config.num_hidden_layers - config.num_dense_layers + self.num_expert_groups = config.n_group + + self.moe_layers: list[SharedFusedMoE] = [] + example_moe = None + for layer in self.model.layers: + if isinstance(layer, PPMissingLayer): + continue + + assert isinstance(layer, AfmoeDecoderLayer) + if layer.moe_enabled: + example_moe = layer.mlp + self.moe_layers.append(layer.mlp.experts) + + if example_moe is None and self.num_moe_layers > 0: + raise RuntimeError("No AfmoeMoE layer found in model.layers.") + + if example_moe is not None: + self.num_logical_experts = 
example_moe.n_logical_experts + self.num_physical_experts = example_moe.n_physical_experts + self.num_local_physical_experts = example_moe.n_local_physical_experts + self.num_routed_experts = example_moe.n_routed_experts + self.num_shared_experts = example_moe.n_shared_experts + self.num_redundant_experts = example_moe.n_redundant_experts + + def set_eplb_state( + self, + expert_load_view: torch.Tensor, + logical_to_physical_map: torch.Tensor, + logical_replica_count: torch.Tensor, + ) -> None: + for layer_idx, layer in enumerate(self.moe_layers): + # Register the expert weights. + self.expert_weights.append(layer.get_expert_weights()) + layer.set_eplb_state( + moe_layer_idx=layer_idx, + expert_load_view=expert_load_view, + logical_to_physical_map=logical_to_physical_map, + logical_replica_count=logical_replica_count, + ) + + def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.embed_input_ids(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + ) -> torch.Tensor | IntermediateTensors: + hidden_states = self.model( + input_ids, positions, intermediate_tensors, inputs_embeds + ) + return hidden_states + + def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor | None: + logits = self.logits_processor(self.lm_head, hidden_states) + return logits + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) + + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + return self.model.get_expert_mapping() diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 4af8fa01f562..6e9790de49bf 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -56,6 +56,7 @@ _TEXT_GENERATION_MODELS = { # [Decoder-only] + "AfmoeForCausalLM": ("afmoe", "AfmoeForCausalLM"), "ApertusForCausalLM": ("apertus", "ApertusForCausalLM"), "AquilaModel": ("llama", "LlamaForCausalLM"), "AquilaForCausalLM": ("llama", "LlamaForCausalLM"), # AquilaChat2 diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index b7418cfb7cc7..49250e071eab 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -77,6 +77,7 @@ def __getitem__(self, key): _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict( + afmoe="AfmoeConfig", chatglm="ChatGLMConfig", deepseek_vl_v2="DeepseekVLV2Config", deepseek_v32=DeepseekV3Config, diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index ac612b255143..dcae05a15fec 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -7,6 +7,7 @@ - There is a need to override the existing config to support vLLM. 
""" +from vllm.transformers_utils.configs.afmoe import AfmoeConfig from vllm.transformers_utils.configs.chatglm import ChatGLMConfig from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config from vllm.transformers_utils.configs.dotsocr import DotsOCRConfig @@ -40,6 +41,7 @@ from vllm.transformers_utils.configs.ultravox import UltravoxConfig __all__ = [ + "AfmoeConfig", "ChatGLMConfig", "DeepseekVLV2Config", "DotsOCRConfig", diff --git a/vllm/transformers_utils/configs/afmoe.py b/vllm/transformers_utils/configs/afmoe.py new file mode 100644 index 000000000000..9b634fd037a3 --- /dev/null +++ b/vllm/transformers_utils/configs/afmoe.py @@ -0,0 +1,84 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from transformers.configuration_utils import PretrainedConfig + + +class AfmoeConfig(PretrainedConfig): + model_type = "afmoe" + + def __init__( + self, + vocab_size: int = 200_192, + hidden_size: int = 2048, + intermediate_size: int = 6144, + moe_intermediate_size: int = 1408, + num_hidden_layers: int = 32, + num_dense_layers: int = 1, + num_attention_heads: int = 16, + num_key_value_heads: int | None = None, + head_dim: int = 128, + hidden_act: str = "silu", + max_position_embeddings: int = 131072, + initializer_range: float = 0.02, + rms_norm_eps: float = 1e-5, + use_cache: bool = True, + tie_word_embeddings: bool = False, + rope_theta: float = 10000.0, + rope_scaling: dict | None = None, + num_experts: int = 64, + num_experts_per_tok: int = 6, + num_shared_experts: int = 2, + num_expert_groups: int = 1, + num_limited_groups: int = 1, + score_func: str = "sigmoid", + route_norm: bool = True, + route_scale: float = 1.0, + global_attn_every_n_layers: int = 4, + sliding_window: int = 2048, + layer_types: list[str] | None = None, + attention_dropout: float = 0.0, + mup_enabled: bool = False, + n_group: int = 1, + topk_group: int = 1, + **kwargs, + ): + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_dense_layers = num_dense_layers + self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads or num_attention_heads + self.head_dim = head_dim + self.hidden_act = hidden_act + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + + self.moe_intermediate_size = moe_intermediate_size + self.num_experts = num_experts + self.num_experts_per_tok = num_experts_per_tok + self.num_shared_experts = num_shared_experts + self.num_expert_groups = num_expert_groups + self.num_limited_groups = num_limited_groups + self.score_func = score_func + self.route_norm = route_norm + self.route_scale = route_scale + + self.global_attn_every_n_layers = global_attn_every_n_layers + self.sliding_window = sliding_window + self.layer_types = layer_types + self.attention_dropout = attention_dropout + + self.mup_enabled = mup_enabled + self.n_group = n_group + self.topk_group = topk_group + + super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + + +__all__ = ["AfmoeConfig"] From 61485844fc5190721b1edf6bed9aa4d5567b70e0 Mon Sep 17 00:00:00 2001 From: Bangsheng Tang <5318912+bangshengtang@users.noreply.github.com> Date: Mon, 17 Nov 2025 15:22:11 -0800 Subject: [PATCH 138/578] [BugFix] Corner case that could cause 
out-of-sync with external launcher mode and dp >1 (#28774) --- vllm/v1/worker/gpu_model_runner.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 2a8ff746f112..0102ca4739ad 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2663,6 +2663,18 @@ def execute_model( return make_empty_encoder_model_runner_output(scheduler_output) if not num_scheduled_tokens: + if ( + self.parallel_config.distributed_executor_backend + == "external_launcher" + and self.parallel_config.data_parallel_size > 1 + ): + # this is a corner case when both external launcher + # and DP are enabled, num_scheduled_tokens could be + # 0, and has_unfinished_requests in the outer loop + # returns True. before returning early here we call + # dummy run to ensure coordinate_batch_across_dp + # is called into to avoid out of sync issues. + self._dummy_run(1) if not has_kv_transfer_group(): # Return empty ModelRunnerOutput if no work to do. return EMPTY_MODEL_RUNNER_OUTPUT From 552cac95b5da283844a9994b94d4b1308a0a0565 Mon Sep 17 00:00:00 2001 From: Zhuohan Li Date: Mon, 17 Nov 2025 15:32:22 -0800 Subject: [PATCH 139/578] [Misc] Fix wrong comment in scheduler (#28880) Signed-off-by: Zhuohan Li --- vllm/v1/core/sched/scheduler.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 61640e856ac1..4323141c435b 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -472,9 +472,9 @@ def schedule(self) -> SchedulerOutput: num_computed_tokens = ( num_new_local_computed_tokens + num_external_computed_tokens ) - # KVTransfer: WAITING reqs have num_computed_tokens > 0 - # after async KV recvs are completed. else: + # KVTransfer: WAITING reqs have num_computed_tokens > 0 + # after async KV recvs are completed. new_computed_blocks = self.kv_cache_manager.empty_kv_cache_blocks num_new_local_computed_tokens = 0 num_computed_tokens = request.num_computed_tokens @@ -483,12 +483,12 @@ def schedule(self) -> SchedulerOutput: external_load_encoder_input = [] new_encoder_compute_budget = encoder_compute_budget - # KVTransfer: loading remote KV, do not allocate for new work. if load_kv_async: + # KVTransfer: loading remote KV, do not allocate for new work. assert num_external_computed_tokens > 0 num_new_tokens = 0 - # Number of tokens to be scheduled. else: + # Number of tokens to be scheduled. # We use `request.num_tokens` instead of # `request.num_prompt_tokens` to consider the resumed # requests, which have output tokens. 
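The external-launcher fix above boils down to a collective-participation rule: under data parallelism, every rank must enter the cross-DP batch-coordination step on every iteration, even when it has zero scheduled tokens, otherwise its peers block waiting for it. The sketch below is not vLLM code; it is a minimal stand-alone illustration in which a threading.Barrier stands in for coordinate_batch_across_dp and two threads stand in for two DP ranks, purely to show why the dummy run is issued before returning early.

import threading

NUM_DP_RANKS = 2
# Stand-in for the cross-DP coordination collective; all "ranks" must arrive.
coordination = threading.Barrier(NUM_DP_RANKS, timeout=2.0)


def rank_step(rank: int, num_scheduled_tokens: int, always_coordinate: bool) -> str:
    try:
        if num_scheduled_tokens == 0 and not always_coordinate:
            # Buggy behaviour: return early and skip the collective entirely.
            return f"rank {rank}: returned early"
        # Fixed behaviour: even with nothing scheduled, run a dummy step so
        # the collective still fires and peers are not left waiting.
        coordination.wait()
        return f"rank {rank}: coordinated"
    except threading.BrokenBarrierError:
        return f"rank {rank}: timed out waiting for peers"


for always_coordinate in (False, True):
    coordination.reset()
    results: list[str] = []
    threads = [
        threading.Thread(
            target=lambda r=rank: results.append(
                rank_step(r, 0 if r == 0 else 8, always_coordinate)
            )
        )
        for rank in range(NUM_DP_RANKS)
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print(f"always_coordinate={always_coordinate}: {sorted(results)}")

With always_coordinate=False, the rank that has work times out at the barrier; with always_coordinate=True, both ranks report "coordinated", mirroring the effect of calling _dummy_run(1) before the early return in the patch above.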
From b6e04390d3ea5ebc79ac70d1b76d638c56fa8ce2 Mon Sep 17 00:00:00 2001 From: Benjamin Bartels Date: Tue, 18 Nov 2025 03:13:25 +0000 Subject: [PATCH 140/578] [Bugfix] Fix Kimi-K2 tool parser concatenated tool calls parsing (#28831) Signed-off-by: Thomas Mao Signed-off-by: bbartels Co-authored-by: Thomas Mao Co-authored-by: Chauncey --- tests/tool_use/test_kimi_k2_tool_parser.py | 122 ++++++++++++++++++ .../tool_parsers/kimi_k2_tool_parser.py | 3 +- 2 files changed, 124 insertions(+), 1 deletion(-) diff --git a/tests/tool_use/test_kimi_k2_tool_parser.py b/tests/tool_use/test_kimi_k2_tool_parser.py index 33dabbc7e7b9..3a48b5206141 100644 --- a/tests/tool_use/test_kimi_k2_tool_parser.py +++ b/tests/tool_use/test_kimi_k2_tool_parser.py @@ -60,6 +60,11 @@ def test_extract_tool_calls_no_tools(kimi_k2_tool_parser): ids=[ "tool_call_with_content_before", "multi_tool_call_with_content_before", + "concatenated_tool_calls_bug_fix", + "three_concatenated_tool_calls", + "mixed_spacing_tool_calls", + "angle_brackets_in_json", + "newlines_in_json", ], argnames=["model_output", "expected_tool_calls", "expected_content"], argvalues=[ @@ -114,6 +119,123 @@ def test_extract_tool_calls_no_tools(kimi_k2_tool_parser): ], "I'll help you check the weather. ", ), + ( + """I'll get the weather and news for LA today. First, let me get the weather using Los Angeles coordinates, and then get the latest news. <|tool_calls_section_begin|><|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{"latitude": 34.0522, "longitude": -118.2437}<|tool_call_end|><|tool_call_begin|>functions.get_news:1<|tool_call_argument_begin|>{"content": "Los Angeles today"}<|tool_call_end|><|tool_calls_section_end|>""", + [ + ToolCall( + id="functions.get_weather:0", + function=FunctionCall( + name="get_weather", + arguments=json.dumps( + {"latitude": 34.0522, "longitude": -118.2437} + ), + ), + type="function", + ), + ToolCall( + id="functions.get_news:1", + function=FunctionCall( + name="get_news", + arguments=json.dumps({"content": "Los Angeles today"}), + ), + type="function", + ), + ], + "I'll get the weather and news for LA today. First, let me get the weather using Los Angeles coordinates, and then get the latest news. ", + ), + ( + """I'll help you with multiple tasks. <|tool_calls_section_begin|><|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{"city": "New York"}<|tool_call_end|><|tool_call_begin|>functions.get_news:1<|tool_call_argument_begin|>{"topic": "technology"}<|tool_call_end|><|tool_call_begin|>functions.send_email:2<|tool_call_argument_begin|>{"to": "user@example.com", "subject": "Daily Update"}<|tool_call_end|><|tool_calls_section_end|>""", + [ + ToolCall( + id="functions.get_weather:0", + function=FunctionCall( + name="get_weather", + arguments=json.dumps({"city": "New York"}), + ), + type="function", + ), + ToolCall( + id="functions.get_news:1", + function=FunctionCall( + name="get_news", + arguments=json.dumps({"topic": "technology"}), + ), + type="function", + ), + ToolCall( + id="functions.send_email:2", + function=FunctionCall( + name="send_email", + arguments=json.dumps( + {"to": "user@example.com", "subject": "Daily Update"} + ), + ), + type="function", + ), + ], + "I'll help you with multiple tasks. ", + ), + ( + """Mixed spacing test. 
<|tool_calls_section_begin|> <|tool_call_begin|> functions.test:0 <|tool_call_argument_begin|> {} <|tool_call_end|><|tool_call_begin|>functions.test2:1<|tool_call_argument_begin|>{}<|tool_call_end|> <|tool_calls_section_end|>""", + [ + ToolCall( + id="functions.test:0", + function=FunctionCall( + name="test", + arguments=json.dumps({}), + ), + type="function", + ), + ToolCall( + id="functions.test2:1", + function=FunctionCall( + name="test2", + arguments=json.dumps({}), + ), + type="function", + ), + ], + "Mixed spacing test. ", + ), + ( + """I need to process HTML content. <|tool_calls_section_begin|><|tool_call_begin|>functions.process_html:0<|tool_call_argument_begin|>{"html": "
<div>content</div>
", "text": "normal text"}<|tool_call_end|><|tool_calls_section_end|>""", + [ + ToolCall( + id="functions.process_html:0", + function=FunctionCall( + name="process_html", + arguments=json.dumps( + {"html": "
<div>content</div>
", "text": "normal text"} + ), + ), + type="function", + ) + ], + "I need to process HTML content. ", + ), + ( + """I need to process formatted JSON. <|tool_calls_section_begin|><|tool_call_begin|>functions.process_data:0<|tool_call_argument_begin|>{ + "name": "test", + "value": 123, + "nested": { + "key": "value" + } +}<|tool_call_end|><|tool_calls_section_end|>""", + [ + ToolCall( + id="functions.process_data:0", + function=FunctionCall( + name="process_data", + arguments=json.dumps( + {"name": "test", "value": 123, "nested": {"key": "value"}}, + indent=2, + ), + ), + type="function", + ) + ], + "I need to process formatted JSON. ", + ), ], ) def test_extract_tool_calls( diff --git a/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py index a84c9e454716..2b84c60a3b84 100644 --- a/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py @@ -60,7 +60,8 @@ def __init__(self, tokenizer: AnyTokenizer): self.tool_call_end_token: str = "<|tool_call_end|>" self.tool_call_regex = re.compile( - r"<\|tool_call_begin\|>\s*(?P.+:\d+)\s*<\|tool_call_argument_begin\|>\s*(?P.*?)\s*<\|tool_call_end\|>" + r"<\|tool_call_begin\|>\s*(?P[^<]+:\d+)\s*<\|tool_call_argument_begin\|>\s*(?P(?:(?!<\|tool_call_begin\|>).)*?)\s*<\|tool_call_end\|>", + re.DOTALL, ) self.stream_tool_call_portion_regex = re.compile( From 88ab591f0b20c28cb167fd65d10ccade99d873ae Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 17 Nov 2025 22:16:03 -0500 Subject: [PATCH 141/578] Run macos smoke test workflow on main commit (#28752) Signed-off-by: Michael Goin Signed-off-by: mgoin --- .github/workflows/macos-smoke-test.yml | 15 ++++++++++----- requirements/cpu-build.txt | 5 +++-- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/.github/workflows/macos-smoke-test.yml b/.github/workflows/macos-smoke-test.yml index 8d40aa587bf0..42b05ecd5ac0 100644 --- a/.github/workflows/macos-smoke-test.yml +++ b/.github/workflows/macos-smoke-test.yml @@ -1,6 +1,9 @@ name: macOS Apple Silicon Smoke Test on: + push: + branches: + - main workflow_dispatch: # Manual trigger jobs: @@ -19,13 +22,15 @@ jobs: pyproject.toml python-version: '3.12' - - name: Install dependencies + - name: Create virtual environment run: | - uv pip install -r requirements/cpu-build.txt - uv pip install -r requirements/cpu.txt + uv venv + echo "$GITHUB_WORKSPACE/.venv/bin" >> "$GITHUB_PATH" - - name: Build vLLM - run: uv pip install -v -e . + - name: Install dependencies and build vLLM + run: | + uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match + uv pip install -e . 
env: CMAKE_BUILD_PARALLEL_LEVEL: 4 diff --git a/requirements/cpu-build.txt b/requirements/cpu-build.txt index 331d02be6621..81d429a5e5f8 100644 --- a/requirements/cpu-build.txt +++ b/requirements/cpu-build.txt @@ -4,8 +4,9 @@ packaging>=24.2 setuptools>=77.0.3,<81.0.0 setuptools-scm>=8 --extra-index-url https://download.pytorch.org/whl/cpu -torch==2.8.0+cpu; platform_machine == "x86_64" -torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64" or platform_system == "Darwin" +torch==2.8.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x" +torch==2.9.0; platform_system == "Darwin" +torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64" scons; platform_machine == "aarch64" # needed to build Arm Compute Library (ACL) wheel jinja2>=3.1.6 From d0a73620cc85a840323d25b28772efac04c006e2 Mon Sep 17 00:00:00 2001 From: xuebwang-amd Date: Tue, 18 Nov 2025 11:16:45 +0800 Subject: [PATCH 142/578] [ROCm][Quantization] add apply_vllm_mapper in quark config for models like gpt-oss (#28638) Signed-off-by: xuebwang-amd Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- .../layers/quantization/quark/quark.py | 35 ++++++++++++++++--- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py index 1bb698faf46d..f59e5e2a0af7 100644 --- a/vllm/model_executor/layers/quantization/quark/quark.py +++ b/vllm/model_executor/layers/quantization/quark/quark.py @@ -32,6 +32,7 @@ deep_compare, should_ignore_layer, ) +from vllm.model_executor.models.utils import WeightsMapper from vllm.platforms import current_platform if TYPE_CHECKING: @@ -57,7 +58,6 @@ def __init__( self.kv_cache_group = kv_cache_group self.kv_cache_config = kv_cache_config self.pack_method = pack_method - self.ignore: list[str] = cast(list[str], self.quant_config.get("exclude", [])) def get_linear_method(self) -> "QuarkLinearMethod": return QuarkLinearMethod(self) @@ -72,14 +72,42 @@ def get_min_capability(cls) -> int: def get_name(self) -> QuantizationMethods: return "quark" + def apply_vllm_mapper( # noqa: B027 + self, hf_to_vllm_mapper: "WeightsMapper" + ): + """ + Interface for models to update module names referenced in + quantization configs in order to reflect the vllm model structure + + :param hf_to_vllm_mapper: maps from hf model structure (the assumed + structure of the qconfig) to vllm model structure + """ + quant_config_with_hf_to_vllm_mapper = {} + + for k, v in self.quant_config.items(): + if isinstance(v, list): + quant_config_with_hf_to_vllm_mapper[k] = hf_to_vllm_mapper.apply_list(v) + elif isinstance(v, dict): + quant_config_with_hf_to_vllm_mapper[k] = hf_to_vllm_mapper.apply_dict(v) + else: + if isinstance(v, str): + mapped_v_list = hf_to_vllm_mapper.apply_list([v]) + if mapped_v_list: + quant_config_with_hf_to_vllm_mapper[k] = mapped_v_list[0] + else: + quant_config_with_hf_to_vllm_mapper[k] = v + + self.quant_config = quant_config_with_hf_to_vllm_mapper + def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> Optional["QuantizeMethodBase"]: from vllm.attention.layer import Attention # Avoid circular import # Check if the layer is skipped for quantization. 
+ exclude_layers = cast(list[str], self.quant_config.get("exclude")) if should_ignore_layer( - prefix, ignore=self.ignore, fused_mapping=self.packed_modules_mapping + prefix, ignore=exclude_layers, fused_mapping=self.packed_modules_mapping ): return UnquantizedLinearMethod() if isinstance(layer, LinearBase): @@ -93,9 +121,6 @@ def get_quant_method( return QuarkMoEMethod.get_moe_method(self, module=layer, layer_name=prefix) return None - def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): - self.ignore = hf_to_vllm_mapper.apply_list(self.ignore) - @classmethod def from_config(cls, config: dict[str, Any]) -> "QuarkConfig": export_config = config.get("export") From 3ddcf4601171797b6e63eda6b5956136441b3408 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Mon, 17 Nov 2025 23:29:29 -0500 Subject: [PATCH 143/578] [Refactor] Remove Unused Func in Batch Invariant (#28881) Signed-off-by: yewentao256 --- vllm/model_executor/layers/batch_invariant.py | 73 ------------------- 1 file changed, 73 deletions(-) diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py index 746a543ab827..7920d117de5e 100644 --- a/vllm/model_executor/layers/batch_invariant.py +++ b/vllm/model_executor/layers/batch_invariant.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import contextlib import os -from collections import namedtuple from collections.abc import Callable from functools import cache from typing import Any @@ -725,10 +723,6 @@ def linear_batch_invariant(input, weight, bias=None): _original_cublaslt_workspace_size = None -def is_batch_invariant_mode_enabled(): - return _batch_invariant_MODE - - def enable_batch_invariant_mode(): global _batch_invariant_MODE, _batch_invariant_LIB, _original_torch_bmm global _original_fp16_reduction_precision, _original_bf16_reduction_precision @@ -791,73 +785,6 @@ def enable_batch_invariant_mode(): torch.backends.cuda.preferred_blas_library(backend="cublaslt") -def disable_batch_invariant_mode(): - global _batch_invariant_MODE, _batch_invariant_LIB, _original_torch_bmm - global _original_fp16_reduction_precision, _original_bf16_reduction_precision - global _original_cublas_workspace_cfg, _original_cublaslt_workspace_size - if not _batch_invariant_MODE: - return - - if _batch_invariant_LIB is not None: - _batch_invariant_LIB._destroy() - if _original_torch_bmm is not None: - torch.bmm = _original_torch_bmm - _original_torch_bmm = None - - if _original_bf16_reduction_precision is not None: - torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = ( - _original_bf16_reduction_precision - ) - _original_bf16_reduction_precision = None - if _original_fp16_reduction_precision is not None: - torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = ( - _original_fp16_reduction_precision - ) - _original_fp16_reduction_precision = None - - torch.backends.cuda.preferred_blas_library(backend="default") - - if not is_torch_equal_or_newer("2.10.0.dev"): - # Set cublas env vars to previous results. If previous results are None, - # that means the env vars were not set, so we should remove them. 
- if _original_cublas_workspace_cfg: - os.environ["CUBLAS_WORKSPACE_CONFIG"] = _original_cublas_workspace_cfg - elif "CUBLAS_WORKSPACE_CONFIG" in os.environ: - del os.environ["CUBLAS_WORKSPACE_CONFIG"] - - if _original_cublaslt_workspace_size: - os.environ["CUBLASLT_WORKSPACE_SIZE"] = _original_cublaslt_workspace_size - elif "CUBLASLT_WORKSPACE_SIZE" in os.environ: - del os.environ["CUBLASLT_WORKSPACE_SIZE"] - - _original_cublas_workspace_cfg = None - _original_cublaslt_workspace_size = None - - _batch_invariant_MODE = False - _batch_invariant_LIB = None - - -@contextlib.contextmanager -def set_batch_invariant_mode(enabled: bool = True): - global _batch_invariant_MODE, _batch_invariant_LIB - old_data = (_batch_invariant_MODE, _batch_invariant_LIB) - if enabled: - enable_batch_invariant_mode() - else: - disable_batch_invariant_mode() - yield - if _batch_invariant_LIB is not None: - _batch_invariant_LIB._destroy() - _batch_invariant_MODE, _batch_invariant_LIB = old_data - - -AttentionBlockSize = namedtuple("AttentionBlockSize", ["block_m", "block_n"]) - - -def get_batch_invariant_attention_block_size() -> AttentionBlockSize: - return AttentionBlockSize(block_m=16, block_n=16) - - @cache def vllm_is_batch_invariant(): env_key = "VLLM_BATCH_INVARIANT" From bf9e1e8767fb4d1143b7e042ed940b84ef031c66 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 18 Nov 2025 12:30:29 +0800 Subject: [PATCH 144/578] [Bugfix] Fix wrong CLI defaults for dynamic `SchedulerConfig` fields (#28872) Signed-off-by: DarkLight1337 --- .../openai/test_enable_force_include_usage.py | 4 ++-- vllm/engine/arg_utils.py | 18 +++++++++++++++--- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/tests/entrypoints/openai/test_enable_force_include_usage.py b/tests/entrypoints/openai/test_enable_force_include_usage.py index 3ddf2308eb1d..9d527c45c1fa 100644 --- a/tests/entrypoints/openai/test_enable_force_include_usage.py +++ b/tests/entrypoints/openai/test_enable_force_include_usage.py @@ -17,7 +17,7 @@ def chat_server_with_force_include_usage(request): # noqa: F811 "128", "--enforce-eager", "--max-num-seqs", - "1", + "4", "--enable-force-include-usage", "--port", "55857", @@ -78,7 +78,7 @@ def transcription_server_with_force_include_usage(): "--dtype", "bfloat16", "--max-num-seqs", - "1", + "4", "--enforce-eager", "--enable-force-include-usage", "--gpu-memory-utilization", diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index d011dfdbfbb2..ab6e5e594c23 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1046,10 +1046,18 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: description=SchedulerConfig.__doc__, ) scheduler_group.add_argument( - "--max-num-batched-tokens", **scheduler_kwargs["max_num_batched_tokens"] + "--max-num-batched-tokens", + **{ + **scheduler_kwargs["max_num_batched_tokens"], + "default": None, + }, ) scheduler_group.add_argument( - "--max-num-seqs", **scheduler_kwargs["max_num_seqs"] + "--max-num-seqs", + **{ + **scheduler_kwargs["max_num_seqs"], + "default": None, + }, ) scheduler_group.add_argument( "--max-num-partial-prefills", **scheduler_kwargs["max_num_partial_prefills"] @@ -1071,7 +1079,11 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "--scheduling-policy", **scheduler_kwargs["policy"] ) scheduler_group.add_argument( - "--enable-chunked-prefill", **scheduler_kwargs["enable_chunked_prefill"] + "--enable-chunked-prefill", + **{ + **scheduler_kwargs["enable_chunked_prefill"], + "default": None, + 
}, ) scheduler_group.add_argument( "--disable-chunked-mm-input", **scheduler_kwargs["disable_chunked_mm_input"] From 083cf326dc9ce92aae6b85fcef678a28e867afe9 Mon Sep 17 00:00:00 2001 From: Didier Durand <2927957+didier-durand@users.noreply.github.com> Date: Tue, 18 Nov 2025 05:32:14 +0100 Subject: [PATCH 145/578] [Doc]: fix typos in various files (#28863) Signed-off-by: Didier Durand --- docs/contributing/profiling.md | 2 +- docs/design/io_processor_plugins.md | 2 +- docs/design/logits_processors.md | 4 ++-- docs/features/disagg_prefill.md | 2 +- docs/features/lora.md | 2 +- vllm/lora/ops/triton_ops/fused_moe_lora_op.py | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md index 7941b1f49ee8..7634cc0859ed 100644 --- a/docs/contributing/profiling.md +++ b/docs/contributing/profiling.md @@ -224,6 +224,6 @@ snakeviz expensive_function.prof Leverage VLLM_GC_DEBUG environment variable to debug GC costs. -- VLLM_GC_DEBUG=1: enable GC debugger with gc.collect elpased times +- VLLM_GC_DEBUG=1: enable GC debugger with gc.collect elapsed times - VLLM_GC_DEBUG='{"top_objects":5}': enable GC debugger to log top 5 collected objects for each gc.collect diff --git a/docs/design/io_processor_plugins.md b/docs/design/io_processor_plugins.md index 2f4b17f191a5..91ab4deae71d 100644 --- a/docs/design/io_processor_plugins.md +++ b/docs/design/io_processor_plugins.md @@ -1,6 +1,6 @@ # IO Processor Plugins -IO Processor plugins are a feature that allows pre and post processing of the model input and output for pooling models. The idea is that users are allowed to pass a custom input to vLLM that is converted into one or more model prompts and fed to the model `encode` method. One potential use-case of such plugins is that of using vLLM for generating multi-modal data. Say users feed an image to vLLM and get an image in output. +IO Processor plugins are a feature that allows pre- and post-processing of the model input and output for pooling models. The idea is that users are allowed to pass a custom input to vLLM that is converted into one or more model prompts and fed to the model `encode` method. One potential use-case of such plugins is that of using vLLM for generating multi-modal data. Say users feed an image to vLLM and get an image in output. When performing an inference with IO Processor plugins, the prompt type is defined by the plugin and the same is valid for the final request output. vLLM does not perform any validation of input/output data, and it is up to the plugin to ensure the correct data is being fed to the model and returned to the user. As of now these plugins support only pooling models and can be triggered via the `encode` method in `LLM` and `AsyncLLM`, or in online serving mode via the `/pooling` endpoint. diff --git a/docs/design/logits_processors.md b/docs/design/logits_processors.md index acf7fc245462..8eadeb386fcf 100644 --- a/docs/design/logits_processors.md +++ b/docs/design/logits_processors.md @@ -411,7 +411,7 @@ Logits processor `update_state()` implementations should assume the following mo * **"Condense" the batch to be contiguous:** starting with the lowest-index empty slot (which was caused by a Remove), apply a Unidirectional Move from the current highest non-empty slot in the batch to fill the empty slot. 
Proceed with additional Unidirectional Move operations in order of increasing empty slot destination index and decreasing non-empty slot source index until the batch is contiguous - * **Shrink the batch:** a side-effect of condensing the batch is that empty slots resulting from Remove operations are grouped in a contiguous block at the end of the batch array. Thus, after condensing, update `BatchUpdate.batch_size` to reflect the number of non-empty slots + * **Shrink the batch:** a side effect of condensing the batch is that empty slots resulting from Remove operations are grouped in a contiguous block at the end of the batch array. Thus, after condensing, update `BatchUpdate.batch_size` to reflect the number of non-empty slots 5. Reorder the batch for improved efficiency. Depending on the attention backend implementation and the current characteristics of the batch, zero or more Swap Move operations may be applied to reorder the batch @@ -548,7 +548,7 @@ Built-in logits processors are always loaded when the vLLM engine starts. See th Review these logits processor implementations for guidance on writing built-in logits processors. -Additionally, the following logits-processor-like functionalities are hard-coded into the sampler and do not yet utilize the programming model described above. Most of them will be refactored to use the aforemented logits processor programming model. +Additionally, the following logits-processor-like functionalities are hard-coded into the sampler and do not yet utilize the programming model described above. Most of them will be refactored to use the aforementioned logits processor programming model. * Allowed token IDs diff --git a/docs/features/disagg_prefill.md b/docs/features/disagg_prefill.md index 3e8cb87e37d3..fd4f249f2ec6 100644 --- a/docs/features/disagg_prefill.md +++ b/docs/features/disagg_prefill.md @@ -91,6 +91,6 @@ Disaggregated prefilling is highly related to infrastructure, so vLLM relies on We recommend three ways of implementations: -- **Fully-customized connector**: Implement your own `Connector`, and call third-party libraries to send and receive KV caches, and many many more (like editing vLLM's model input to perform customized prefilling, etc). This approach gives you the most control, but at the risk of being incompatible with future vLLM versions. +- **Fully-customized connector**: Implement your own `Connector`, and call third-party libraries to send and receive KV caches, and many many more (like editing vLLM's model input to perform customized prefilling, etc.). This approach gives you the most control, but at the risk of being incompatible with future vLLM versions. - **Database-like connector**: Implement your own `LookupBuffer` and support the `insert` and `drop_select` APIs just like SQL. - **Distributed P2P connector**: Implement your own `Pipe` and support the `send_tensor` and `recv_tensor` APIs, just like `torch.distributed`. diff --git a/docs/features/lora.md b/docs/features/lora.md index 3a85b52d89b6..d42a3cef76bd 100644 --- a/docs/features/lora.md +++ b/docs/features/lora.md @@ -4,7 +4,7 @@ This document shows you how to use [LoRA adapters](https://arxiv.org/abs/2106.09 LoRA adapters can be used with any vLLM model that implements [SupportsLoRA][vllm.model_executor.models.interfaces.SupportsLoRA]. -Adapters can be efficiently served on a per request basis with minimal overhead. First we download the adapter(s) and save +Adapters can be efficiently served on a per-request basis with minimal overhead. 
First we download the adapter(s) and save them locally with ```python diff --git a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py index 893972144e99..e2dd47dbb4e6 100644 --- a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py +++ b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py @@ -154,7 +154,7 @@ def _fused_moe_lora_kernel( k_remaining = K - k * (BLOCK_SIZE_K * SPLIT_K) # pre-fetch lora weight b = tl.load(b_ptrs, mask=offs_k[:, None] < k_remaining, other=0.0) - # GDC wait waits for ALL programs in the the prior kernel to complete + # GDC wait waits for ALL programs in the prior kernel to complete # before continuing. if USE_GDC and not IS_PRIMARY: tl.extra.cuda.gdc_wait() From 0168f69e50898fd5f09ac64a0d735039e57e7806 Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Tue, 18 Nov 2025 12:33:46 +0800 Subject: [PATCH 146/578] [Misc] Remove unnecessary parentheses from log statements (#28897) Signed-off-by: Andy Xie --- vllm/model_executor/models/registry.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 6e9790de49bf..a2de597c87d8 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -597,7 +597,7 @@ def _load_modelinfo_from_cache(self, module_hash: str) -> _ModelInfo | None: mi_dict = json.load(file) except FileNotFoundError: logger.debug( - ("Cached model info file for class %s.%s not found"), + "Cached model info file for class %s.%s not found", self.module_name, self.class_name, ) @@ -605,7 +605,7 @@ def _load_modelinfo_from_cache(self, module_hash: str) -> _ModelInfo | None: if mi_dict["hash"] != module_hash: logger.debug( - ("Cached model info file for class %s.%s is stale"), + "Cached model info file for class %s.%s is stale", self.module_name, self.class_name, ) @@ -615,7 +615,7 @@ def _load_modelinfo_from_cache(self, module_hash: str) -> _ModelInfo | None: return _ModelInfo(**mi_dict["modelinfo"]) except Exception: logger.debug( - ("Cached model info for class %s.%s error. "), + "Cached model info for class %s.%s error. ", self.module_name, self.class_name, ) @@ -650,14 +650,14 @@ def inspect_model_cls(self) -> _ModelInfo: mi = self._load_modelinfo_from_cache(module_hash) if mi is not None: logger.debug( - ("Loaded model info for class %s.%s from cache"), + "Loaded model info for class %s.%s from cache", self.module_name, self.class_name, ) return mi else: logger.debug( - ("Cache model info for class %s.%s miss. Loading model instead."), + "Cache model info for class %s.%s miss. Loading model instead.", self.module_name, self.class_name, ) From 5bdd15527770ef39cc4c3cdca008fb4f9cf8a15f Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Mon, 17 Nov 2025 21:26:32 -0800 Subject: [PATCH 147/578] [CI] Fix async scheduling + spec decoding test flake (#28902) Signed-off-by: Nick Hill --- tests/v1/e2e/test_async_scheduling.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/v1/e2e/test_async_scheduling.py b/tests/v1/e2e/test_async_scheduling.py index f732b05f09f9..00d93e1ba0b5 100644 --- a/tests/v1/e2e/test_async_scheduling.py +++ b/tests/v1/e2e/test_async_scheduling.py @@ -84,6 +84,7 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch): "num_speculative_tokens": 2, "model": "nm-testing/Llama3_2_1B_speculator.eagle3", } + # Set small draft model len to force doesn't-fit-in-drafter case. 
spec_config_short = spec_config | {"max_model_len": 50} # test_preemption, executor, async_scheduling, @@ -174,13 +175,14 @@ def run_tests( ): if "spec_mml=None" in test_config: assert ( - pytest.approx(test_acceptance_rate, rel=5e-2) - == base_acceptance_rate + test_acceptance_rate > base_acceptance_rate + or test_acceptance_rate + == pytest.approx(base_acceptance_rate, rel=5e-2) ) else: # Currently the reported acceptance rate is expected to be # lower when we sometimes skip drafting altogether. - assert test_acceptance_rate > 0.05 + assert test_acceptance_rate > 0.1 print( f"PASSED: config=[{test_config}], params={params}" f" accept_rate={test_acceptance_rate}" From 5bb1da5190b54aefb08478c6b1170f97722b8bdb Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 18 Nov 2025 13:28:31 +0800 Subject: [PATCH 148/578] [MISC] Remove format.sh (#28906) Signed-off-by: Kuntai Du --- format.sh | 6 ------ 1 file changed, 6 deletions(-) delete mode 100755 format.sh diff --git a/format.sh b/format.sh deleted file mode 100755 index 6ba93e0a19ba..000000000000 --- a/format.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash - -echo "vLLM linting system has been moved from format.sh to pre-commit hooks." -echo "Please run 'pip install -r requirements/lint.txt', followed by" -echo "'pre-commit install' to install the pre-commit hooks." -echo "Then linters will run automatically before each commit." \ No newline at end of file From 896e41ae04d18b0f984eefbb41b920aa7505f5d1 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Tue, 18 Nov 2025 16:10:55 +0800 Subject: [PATCH 149/578] [CI/Build] Replace wikipedia url with local server ones (#28908) Signed-off-by: Isotr0py --- tests/entrypoints/openai/test_metrics.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index dbcec9d31fc9..4e7b765d7713 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -16,6 +16,7 @@ from vllm import version +from ...conftest import LocalAssetServer from ...utils import RemoteOpenAIServer MODELS = { @@ -69,7 +70,6 @@ async def client(server): _PROMPT = "Hello my name is Robert and I love magic" -_IMAGE_URL = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" def _get_expected_values(num_requests: int, prompt_ids: list[int], max_tokens: int): @@ -250,6 +250,7 @@ async def test_metrics_counts( @pytest.mark.asyncio async def test_metrics_exist( + local_asset_server: LocalAssetServer, server: RemoteOpenAIServer, client: openai.AsyncClient, model_key: str, @@ -265,13 +266,21 @@ async def test_metrics_exist( temperature=0.0, ) else: + # https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg await client.chat.completions.create( model=model_name, messages=[ { "role": "user", "content": [ - {"type": "image_url", "image_url": {"url": _IMAGE_URL}}, + { + "type": "image_url", + "image_url": { + "url": local_asset_server.url_for( + "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + ), + }, + }, {"type": "text", "text": "What's in this image?"}, ], } From 439368496db48d8f992ba8c606a0c0b1eebbfa69 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Tue, 18 Nov 2025 00:20:45 -0800 Subject: [PATCH 150/578] [BugFix] Fix PP/async scheduling with pooling models (#28899) Signed-off-by: Nick 
Hill Co-authored-by: Cyrus Leung --- vllm/v1/engine/core.py | 3 ++- vllm/v1/executor/ray_executor.py | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 97286c6e2e5e..d49eb752d56a 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -184,6 +184,7 @@ def __init__( vllm_config.ec_transfer_config is not None and vllm_config.ec_transfer_config.is_ec_producer ) + self.is_pooling_model = vllm_config.model_config.runner_type == "pooling" self.request_block_hasher: Callable[[Request], list[BlockHash]] | None = None if vllm_config.cache_config.enable_prefix_caching or kv_connector is not None: @@ -392,7 +393,7 @@ def step_with_batch_queue( if not self.ec_producer: model_executed = scheduler_output.total_num_scheduled_tokens > 0 - if not model_executed: + if self.is_pooling_model or not model_executed: # No sampling required (no requests scheduled). future = cast(Future[ModelRunnerOutput], exec_future) else: diff --git a/vllm/v1/executor/ray_executor.py b/vllm/v1/executor/ray_executor.py index 55db7445c9c7..406eafcd339b 100644 --- a/vllm/v1/executor/ray_executor.py +++ b/vllm/v1/executor/ray_executor.py @@ -99,9 +99,9 @@ def _init_executor(self) -> None: # KV connector setup self.has_connector = self.vllm_config.kv_transfer_config is not None - self.ec_producer = ( - self.vllm_config.ec_transfer_config is not None - and self.vllm_config.ec_transfer_config.is_ec_producer + self.uses_sampler = self.vllm_config.model_config.runner_type != "pooling" and ( + self.vllm_config.ec_transfer_config is None + or not self.vllm_config.ec_transfer_config.is_ec_producer ) self.scheduler_output: SchedulerOutput | None = None @@ -401,7 +401,7 @@ def execute_model( # type: ignore[override] "after execute_model() returns None." ) - if self.ec_producer or not scheduler_output.total_num_scheduled_tokens: + if not self.uses_sampler or not scheduler_output.total_num_scheduled_tokens: # Model will not execute, call model runner immediately. 
return self._execute_dag(scheduler_output, None, non_block) From 285eaa42857ba2a8f377fdd0dcd84120260d8f65 Mon Sep 17 00:00:00 2001 From: Song Zhixin Date: Tue, 18 Nov 2025 18:53:44 +0800 Subject: [PATCH 151/578] [Bugfix] Safeguard against missing backend in AttentionBackendEnum (#28846) Signed-off-by: jesse Signed-off-by: Song Zhixin Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/attention/layer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 37f9a4b383ce..a8e796a1eab6 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -310,7 +310,8 @@ def __init__( kv_sharing_target_layer_name, **extra_impl_args, ) - self.backend = AttentionBackendEnum[self.attn_backend.get_name()] + backend_name = self.attn_backend.get_name() + self.backend = AttentionBackendEnum.__members__.get(backend_name) self.dtype = dtype # For cuda-alike (CUDA and ROCM) and cpu platforms, we control how From b9489f51e1c61c96378e12c9523f9de7043ca294 Mon Sep 17 00:00:00 2001 From: Canlin Guo Date: Tue, 18 Nov 2025 19:51:54 +0800 Subject: [PATCH 152/578] [Model][Perf] Use cos and sin cache in QwenVL (#28798) Signed-off-by: gcanlin --- .../layers/rotary_embedding/base.py | 5 + vllm/model_executor/models/glm4_1v.py | 88 +++++------- vllm/model_executor/models/qwen2_5_vl.py | 123 ++++++++-------- vllm/model_executor/models/qwen2_vl.py | 135 ++++++------------ .../models/qwen3_omni_moe_thinker.py | 40 ++++-- vllm/model_executor/models/qwen3_vl.py | 44 ++++-- 6 files changed, 218 insertions(+), 217 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding/base.py b/vllm/model_executor/layers/rotary_embedding/base.py index ce4f40680b0a..4114b21168cc 100644 --- a/vllm/model_executor/layers/rotary_embedding/base.py +++ b/vllm/model_executor/layers/rotary_embedding/base.py @@ -83,6 +83,11 @@ def _match_cos_sin_cache_dtype(self, query: torch.Tensor) -> None: ): self.cos_sin_cache = self.cos_sin_cache.to(query.device, dtype=query.dtype) + def get_cos_sin(self, seqlen: int) -> tuple[torch.Tensor, torch.Tensor]: + cos_sin = self.cos_sin_cache[:seqlen] + cos, sin = cos_sin.chunk(2, dim=-1) + return cos, sin + class RotaryEmbedding(RotaryEmbeddingBase): def __init__( diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 6953b805653b..65c3fc2d9e97 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -65,6 +65,7 @@ RowParallelLinear, ) from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.multimodal import MULTIMODAL_REGISTRY @@ -341,7 +342,8 @@ def forward( self, x: torch.Tensor, cu_seqlens: torch.Tensor, - rotary_pos_emb: torch.Tensor, + rotary_pos_emb_cos: torch.Tensor, + rotary_pos_emb_sin: torch.Tensor, max_seqlen: int | None = None, # Only used for Flash Attention seqlens: list[int] | None = None, # Only used for xFormers ) -> torch.Tensor: @@ -353,10 +355,12 @@ def forward( batch_size = q.shape[1] q, k, v = (rearrange(x, "s b ... 
-> b s ...").contiguous() for x in (q, k, v)) - if rotary_pos_emb is not None: + if rotary_pos_emb_cos is not None and rotary_pos_emb_sin is not None: # [2 * b, s, heads, head_dim] qk_concat = torch.cat([q, k], dim=0) - qk_rotated = apply_rotary_pos_emb_vision(qk_concat, rotary_pos_emb) + qk_rotated = apply_rotary_pos_emb_vision( + qk_concat, rotary_pos_emb_cos, rotary_pos_emb_sin + ) q, k = torch.chunk(qk_rotated, 2, dim=0) if self.is_flash_attn_backend: @@ -454,14 +458,16 @@ def forward( self, x: torch.Tensor, cu_seqlens: torch.Tensor, - rotary_pos_emb: torch.Tensor, + rotary_pos_emb_cos: torch.Tensor, + rotary_pos_emb_sin: torch.Tensor, max_seqlen: int | None = None, # Only used for Flash Attention seqlens: list[int] | None = None, # Only used for xFormers ) -> torch.Tensor: x_attn = self.attn( self.norm1(x), cu_seqlens=cu_seqlens, - rotary_pos_emb=rotary_pos_emb, + rotary_pos_emb_cos=rotary_pos_emb_cos, + rotary_pos_emb_sin=rotary_pos_emb_sin, max_seqlen=max_seqlen, seqlens=seqlens, ) @@ -660,44 +666,6 @@ def forward( return embeddings -class Glm4vVisionRotaryEmbedding(nn.Module): - def __init__(self, dim: int, theta: float = 10000.0) -> None: - super().__init__() - self.dim = dim - self.theta = theta - inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self._seq_len_cached = 0 - self._freqs_cached = None - - def update_freqs_cache(self, seqlen: int) -> None: - if seqlen > self._seq_len_cached: - seqlen *= 2 - self._seq_len_cached = seqlen - self.inv_freq = 1.0 / ( - self.theta - ** ( - torch.arange( - 0, - self.dim, - 2, - dtype=torch.float, - device=self.inv_freq.device, - ) - / self.dim - ) - ) - seq = torch.arange( - seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype - ) - freqs = torch.outer(seq, self.inv_freq) - self._freqs_cached = freqs - - def forward(self, seqlen: int) -> torch.Tensor: - self.update_freqs_cache(seqlen) - return self._freqs_cached[:seqlen] - - class Glm4vVisionTransformer(nn.Module): def __init__( self, @@ -731,7 +699,13 @@ def __init__( norm_layer = partial(RMSNorm, eps=norm_eps) head_dim = self.hidden_size // self.num_heads - self.rotary_pos_emb = Glm4vVisionRotaryEmbedding(head_dim // 2) + self.rotary_pos_emb = get_rope( + head_size=head_dim, + rotary_dim=head_dim // 2, + max_position=8192, + base=10000.0, + is_neox_style=True, + ) self.blocks = nn.ModuleList( [ Glm4vVisionBlock( @@ -789,7 +763,9 @@ def dtype(self) -> torch.dtype: def device(self) -> torch.device: return self.patch_embed.proj.weight.device - def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: + def rot_pos_emb( + self, grid_thw: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: pos_ids = [] for t, h, w in grid_thw: hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) @@ -817,9 +793,18 @@ def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) pos_ids = torch.cat(pos_ids, dim=0) max_grid_size = grid_thw[:, 1:].max() - rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) - rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) - return rotary_pos_emb, pos_ids + + # Use pre-computed cos_sin_cache from RotaryEmbedding + cos, sin = self.rotary_pos_emb.get_cos_sin(max_grid_size) + + cos_h = cos[pos_ids[:, 0]] # (num_tokens, rotary_dim // 2) + cos_w = cos[pos_ids[:, 1]] + sin_h = sin[pos_ids[:, 0]] + sin_w = sin[pos_ids[:, 1]] + + cos_combined = torch.cat([cos_h, cos_w], dim=-1) + 
sin_combined = torch.cat([sin_h, sin_w], dim=-1) + return cos_combined, sin_combined, pos_ids def compute_attn_mask_seqlen( self, @@ -848,7 +833,9 @@ def forward( x = self.post_conv_layernorm(x) # compute position embedding - rotary_pos_emb, image_type_ids = self.rot_pos_emb(grid_thw) + rotary_pos_emb_cos, rotary_pos_emb_sin, image_type_ids = self.rot_pos_emb( + grid_thw + ) # compute cu_seqlens cu_seqlens = torch.repeat_interleave( grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0] @@ -867,7 +854,8 @@ def forward( x = blk( x, cu_seqlens=cu_seqlens, - rotary_pos_emb=rotary_pos_emb, + rotary_pos_emb_cos=rotary_pos_emb_cos, + rotary_pos_emb_sin=rotary_pos_emb_sin, max_seqlen=max_seqlen, seqlens=seqlens, ) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 897dd7ef29f1..2e4fd9645d88 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -64,6 +64,7 @@ RowParallelLinear, ) from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.vision import should_torch_compile_mm_vit @@ -363,7 +364,8 @@ def forward( self, x: torch.Tensor, cu_seqlens: torch.Tensor, - rotary_pos_emb: torch.Tensor, + rotary_pos_emb_cos: torch.Tensor, + rotary_pos_emb_sin: torch.Tensor, max_seqlen: torch.Tensor, # Only used for Flash Attention seqlens: torch.Tensor, # Only used for xFormers ) -> torch.Tensor: @@ -378,13 +380,15 @@ def forward( head=self.num_attention_heads_per_partition, ) - if rotary_pos_emb is not None: + if rotary_pos_emb_cos is not None and rotary_pos_emb_sin is not None: qk, v = qkv[:, :, :2], qkv[:, :, 2] qk_reshaped = einops.rearrange( qk, "b s two head head_dim -> (two b) s head head_dim", two=2 ) - qk_rotated = apply_rotary_pos_emb_vision(qk_reshaped, rotary_pos_emb) + qk_rotated = apply_rotary_pos_emb_vision( + qk_reshaped, cos=rotary_pos_emb_cos, sin=rotary_pos_emb_sin + ) qk_rotated = qk_rotated.view( 2, batch_size, @@ -434,7 +438,8 @@ def forward( dynamic_arg_dims={ "x": 0, "cu_seqlens": 0, - "rotary_pos_emb": 0, + "rotary_pos_emb_cos": 0, + "rotary_pos_emb_sin": 0, "seqlens": 0, }, mark_unbacked_dims={"seqlens": 0}, @@ -485,14 +490,16 @@ def forward( self, x: torch.Tensor, cu_seqlens: torch.Tensor, - rotary_pos_emb: torch.Tensor, + rotary_pos_emb_cos: torch.Tensor, + rotary_pos_emb_sin: torch.Tensor, max_seqlen: torch.Tensor, # Only used for Flash Attention seqlens: torch.Tensor, # Only used for xFormers ) -> torch.Tensor: x_attn = self.attn( self.norm1(x), cu_seqlens=cu_seqlens, - rotary_pos_emb=rotary_pos_emb, + rotary_pos_emb_cos=rotary_pos_emb_cos, + rotary_pos_emb_sin=rotary_pos_emb_sin, max_seqlen=max_seqlen, seqlens=seqlens, ) @@ -588,42 +595,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return out -class Qwen2_5_VisionRotaryEmbedding(nn.Module): - def __init__(self, dim: int, theta: float = 10000.0) -> None: - super().__init__() - self.dim = dim - self.theta = theta - inv_freq = 1.0 / ( - theta ** (torch.arange(0, dim, 2, dtype=torch.float, device="cpu") / dim) - ) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self._seq_len_cached = 0 - self._freqs_cached = None - - def update_freqs_cache(self, seqlen: int) -> None: - if seqlen > self._seq_len_cached: - seqlen *= 2 - self._seq_len_cached = seqlen - 
self.inv_freq = 1.0 / ( - self.theta - ** ( - torch.arange( - 0, self.dim, 2, dtype=torch.float, device=self.inv_freq.device - ) - / self.dim - ) - ) - seq = torch.arange( - seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype - ) - freqs = torch.outer(seq, self.inv_freq) - self._freqs_cached = freqs - - def forward(self, seqlen: int) -> torch.Tensor: - self.update_freqs_cache(seqlen) - return self._freqs_cached[:seqlen] - - class Qwen2_5_VisionTransformer(nn.Module): def __init__( self, @@ -666,7 +637,13 @@ def __init__( norm_layer = partial(RMSNorm, eps=norm_eps) head_dim = self.hidden_size // self.num_heads - self.rotary_pos_emb = Qwen2_5_VisionRotaryEmbedding(head_dim // 2) + self.rotary_pos_emb = get_rope( + head_size=head_dim, + rotary_dim=head_dim // 2, + max_position=8192, + base=10000.0, + is_neox_style=True, + ) use_upstream_fa = False self.attn_backend = get_vit_attn_backend( @@ -757,15 +734,30 @@ def rotary_pos_emb_thw(self, t, h, w): ) pos_ids = torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1) max_size = max(h, w) - rotary_pos_emb_full = self.rotary_pos_emb(max_size) - rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) - rotary_pos_emb = rotary_pos_emb.reshape( - rotary_pos_emb.shape[0] // self.spatial_merge_unit, + + # Use pre-computed cos_sin_cache from RotaryEmbedding + cos, sin = self.rotary_pos_emb.get_cos_sin(max_size) + + cos_h = cos[pos_ids[:, 0]] # (num_tokens, rotary_dim // 2) + cos_w = cos[pos_ids[:, 1]] + sin_h = sin[pos_ids[:, 0]] + sin_w = sin[pos_ids[:, 1]] + + cos_combined = torch.cat([cos_h, cos_w], dim=-1) + sin_combined = torch.cat([sin_h, sin_w], dim=-1) + + cos_combined = cos_combined.reshape( + cos_combined.shape[0] // self.spatial_merge_unit, + self.spatial_merge_unit, + -1, + ) + sin_combined = sin_combined.reshape( + sin_combined.shape[0] // self.spatial_merge_unit, self.spatial_merge_unit, -1, ) - return rotary_pos_emb + return cos_combined, sin_combined def get_window_index_thw(self, grid_t, grid_h, grid_w): vit_merger_window_size = ( @@ -807,14 +799,19 @@ def get_window_index_thw(self, grid_t, grid_h, grid_w): @lru_cache(maxsize=1024) # noqa: B019 def get_rope_by_thw(self, t, h, w): window_index_thw, cu_seqlens_window_thw = self.get_window_index_thw(t, h, w) - rotary_pos_emb_thw = self.rotary_pos_emb_thw(t, h, w) - rotary_pos_emb_thw = rotary_pos_emb_thw[window_index_thw, :, :] - rotary_pos_emb_thw = rotary_pos_emb_thw.flatten(start_dim=0, end_dim=1) + cos_thw, sin_thw = self.rotary_pos_emb_thw(t, h, w) + + cos_thw = cos_thw[window_index_thw, :, :] + cos_thw = cos_thw.flatten(start_dim=0, end_dim=1) + sin_thw = sin_thw[window_index_thw, :, :] + sin_thw = sin_thw.flatten(start_dim=0, end_dim=1) + cu_seqlens_thw = torch.repeat_interleave( torch.tensor([h * w], dtype=torch.int32), t ) return ( - rotary_pos_emb_thw, + cos_thw, + sin_thw, window_index_thw, cu_seqlens_window_thw, cu_seqlens_thw, @@ -849,7 +846,8 @@ def forward( ) -> torch.Tensor: # patchify seq_len, _ = x.size() - rotary_pos_emb = [] + rotary_pos_emb_cos = [] + rotary_pos_emb_sin = [] window_index: list = [] cu_window_seqlens: list = [torch.tensor([0], dtype=torch.int32)] cu_seqlens: list = [] @@ -865,7 +863,8 @@ def forward( llm_w = w // self.spatial_merge_size ( - rotary_pos_emb_thw, + cos_thw, + sin_thw, window_index_thw, cu_seqlens_window_thw, cu_seqlens_thw, @@ -878,11 +877,13 @@ def forward( cu_window_seqlens_last = cu_seqlens_window_thw[-1] cu_window_seqlens.append(cu_seqlens_window_thw) - rotary_pos_emb.append(rotary_pos_emb_thw) + 
rotary_pos_emb_cos.append(cos_thw) + rotary_pos_emb_sin.append(sin_thw) cu_seqlens.append(cu_seqlens_thw) - rotary_pos_emb = torch.cat(rotary_pos_emb) + rotary_pos_emb_cos = torch.cat(rotary_pos_emb_cos) + rotary_pos_emb_sin = torch.cat(rotary_pos_emb_sin) window_index = torch.cat(window_index) # compute reverse indices reverse_indices = self.invert_permutation(window_index) @@ -901,7 +902,12 @@ def forward( cu_seqlens = cu_seqlens.to(device=self.device, non_blocking=True) cu_window_seqlens = cu_window_seqlens.to(device=self.device, non_blocking=True) - rotary_pos_emb = rotary_pos_emb.to(device=self.device, non_blocking=True) + rotary_pos_emb_cos = rotary_pos_emb_cos.to( + device=self.device, non_blocking=True + ) + rotary_pos_emb_sin = rotary_pos_emb_sin.to( + device=self.device, non_blocking=True + ) window_index = window_index.to(device=hidden_states.device, non_blocking=True) reverse_indices = reverse_indices.to( device=hidden_states.device, non_blocking=True @@ -928,7 +934,8 @@ def forward( hidden_states = blk( hidden_states, cu_seqlens=cu_seqlens_now, - rotary_pos_emb=rotary_pos_emb, + rotary_pos_emb_cos=rotary_pos_emb_cos, + rotary_pos_emb_sin=rotary_pos_emb_sin, max_seqlen=max_seqlen_now, seqlens=seqlens_now, ) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 5d21e249fc4c..53df5972a8fe 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -32,7 +32,7 @@ import torch import torch.nn as nn import torch.nn.functional as F -from einops import rearrange, repeat +from einops import rearrange from transformers import BatchFeature from transformers.models.qwen2_vl import Qwen2VLImageProcessor, Qwen2VLProcessor from transformers.models.qwen2_vl.configuration_qwen2_vl import ( @@ -59,7 +59,9 @@ RowParallelLinear, ) from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.rotary_embedding.common import ( + apply_rotary_emb_torch, dispatch_rotary_emb_function, ) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -275,47 +277,13 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x -def rotate_half(x: torch.Tensor, interleaved: bool = False) -> torch.Tensor: - if not interleaved: - x1, x2 = x.chunk(2, dim=-1) - return torch.cat((-x2, x1), dim=-1) - else: - x1, x2 = x[..., ::2], x[..., 1::2] - return rearrange( - torch.stack((-x2, x1), dim=-1), "... d two -> ... (d two)", two=2 - ) - - -def apply_rotary_emb_torch( - x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, interleaved: bool = False +def apply_rotary_pos_emb_vision( + t: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor ) -> torch.Tensor: - """ - x: (batch_size, seqlen, nheads, headdim) - cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2) - """ - ro_dim = cos.shape[-1] * 2 - assert ro_dim <= x.shape[-1] - cos = repeat( - cos, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)" - ) - sin = repeat( - sin, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 
1 (d 2)" + rotary_emb_function = dispatch_rotary_emb_function( + default=partial(apply_rotary_emb_torch, is_neox_style=True) ) - return torch.cat( - [ - x[..., :ro_dim] * cos + rotate_half(x[..., :ro_dim], interleaved) * sin, - x[..., ro_dim:], - ], - dim=-1, - ) - - -def apply_rotary_pos_emb_vision(t: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor: - rotary_emb_function = dispatch_rotary_emb_function(default=apply_rotary_emb_torch) - t_ = t.float() - cos = freqs.cos() - sin = freqs.sin() - output = rotary_emb_function(t_, cos, sin).type_as(t) + output = rotary_emb_function(t, cos, sin).type_as(t) return output @@ -412,7 +380,8 @@ def forward( self, x: torch.Tensor, cu_seqlens: torch.Tensor, - rotary_pos_emb: torch.Tensor, + rotary_pos_emb_cos: torch.Tensor, + rotary_pos_emb_sin: torch.Tensor, max_seqlen: int | None = None, # Only used for Flash Attention seqlens: list[int] | None = None, # Only used for xFormers ) -> torch.Tensor: @@ -424,11 +393,13 @@ def forward( batch_size = q.shape[1] q, k, v = (rearrange(x, "s b ... -> b s ...") for x in (q, k, v)) - if rotary_pos_emb is not None: - # [2 * b, s, heads, head_dim] - qk_concat = torch.cat([q, k], dim=0) - qk_rotated = apply_rotary_pos_emb_vision(qk_concat, rotary_pos_emb) - q, k = torch.chunk(qk_rotated, 2, dim=0) + + # [2 * b, s, heads, head_dim] + qk_concat = torch.cat([q, k], dim=0) + qk_rotated = apply_rotary_pos_emb_vision( + qk_concat, rotary_pos_emb_cos, rotary_pos_emb_sin + ) + q, k = torch.chunk(qk_rotated, 2, dim=0) if self.is_flash_attn_backend: q, k, v = (rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]) @@ -534,14 +505,16 @@ def forward( self, x: torch.Tensor, cu_seqlens: torch.Tensor, - rotary_pos_emb: torch.Tensor, + rotary_pos_emb_cos: torch.Tensor, + rotary_pos_emb_sin: torch.Tensor, max_seqlen: int | None = None, # Only used for Flash Attention seqlens: list[int] | None = None, # Only used for xFormers ) -> torch.Tensor: x = x + self.attn( self.norm1(x), cu_seqlens=cu_seqlens, - rotary_pos_emb=rotary_pos_emb, + rotary_pos_emb_cos=rotary_pos_emb_cos, + rotary_pos_emb_sin=rotary_pos_emb_sin, max_seqlen=max_seqlen, seqlens=seqlens, ) @@ -628,40 +601,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return out -class Qwen2VisionRotaryEmbedding(nn.Module): - def __init__(self, dim: int, theta: float = 10000.0) -> None: - super().__init__() - self.dim = dim - self.theta = theta - inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self._seq_len_cached = 0 - self._freqs_cached = None - - def update_freqs_cache(self, seqlen: int) -> None: - if seqlen > self._seq_len_cached: - seqlen *= 2 - self._seq_len_cached = seqlen - self.inv_freq = 1.0 / ( - self.theta - ** ( - torch.arange( - 0, self.dim, 2, dtype=torch.float, device=self.inv_freq.device - ) - / self.dim - ) - ) - seq = torch.arange( - seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype - ) - freqs = torch.outer(seq, self.inv_freq) - self._freqs_cached = freqs - - def forward(self, seqlen: int) -> torch.Tensor: - self.update_freqs_cache(seqlen) - return self._freqs_cached[:seqlen] - - class Qwen2VisionTransformer(nn.Module): def __init__( self, @@ -700,7 +639,13 @@ def __init__( norm_layer = partial(nn.LayerNorm, eps=norm_eps) head_dim = embed_dim // num_heads - self.rotary_pos_emb = Qwen2VisionRotaryEmbedding(head_dim // 2) + self.rotary_pos_emb = get_rope( + head_size=head_dim, + rotary_dim=head_dim // 2, + max_position=8192, + base=10000.0, + 
is_neox_style=True, + ) self.blocks = nn.ModuleList( [ @@ -744,7 +689,9 @@ def dtype(self) -> torch.dtype: def device(self) -> torch.device: return self.patch_embed.proj.weight.device - def rot_pos_emb(self, grid_thw: list[list[int]]) -> torch.Tensor: + def rot_pos_emb( + self, grid_thw: list[list[int]] + ) -> tuple[torch.Tensor, torch.Tensor]: pos_ids = [] max_grid_size = 0 for t, h, w in grid_thw: @@ -773,9 +720,18 @@ def rot_pos_emb(self, grid_thw: list[list[int]]) -> torch.Tensor: pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) max_grid_size = max(max_grid_size, h, w) pos_ids = torch.cat(pos_ids, dim=0) - rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) - rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) - return rotary_pos_emb + + # Use pre-computed cos_sin_cache from RotaryEmbedding + cos, sin = self.rotary_pos_emb.get_cos_sin(max_grid_size) + + cos_h = cos[pos_ids[:, 0]] # (num_tokens, rotary_dim // 2) + cos_w = cos[pos_ids[:, 1]] + sin_h = sin[pos_ids[:, 0]] + sin_w = sin[pos_ids[:, 1]] + + cos_combined = torch.cat([cos_h, cos_w], dim=-1) + sin_combined = torch.cat([sin_h, sin_w], dim=-1) + return cos_combined, sin_combined def compute_attn_mask_seqlen( self, cu_seqlens: torch.Tensor @@ -806,7 +762,7 @@ def forward( grid_thw_list = grid_thw.tolist() # compute position embedding - rotary_pos_emb = self.rot_pos_emb(grid_thw_list) + rotary_pos_emb_cos, rotary_pos_emb_sin = self.rot_pos_emb(grid_thw_list) # compute cu_seqlens cu_seqlens = torch.repeat_interleave( @@ -824,7 +780,8 @@ def forward( x = blk( x, cu_seqlens=cu_seqlens, - rotary_pos_emb=rotary_pos_emb, + rotary_pos_emb_cos=rotary_pos_emb_cos, + rotary_pos_emb_sin=rotary_pos_emb_sin, max_seqlen=max_seqlen, seqlens=seqlens, ) diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index 40b80ce2387c..8274b92138f7 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -60,6 +60,7 @@ ) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.qwen2_audio import Qwen2AudioProcessingInfo @@ -90,7 +91,6 @@ ) from .qwen2_5_vl import ( Qwen2_5_VisionAttention, - Qwen2_5_VisionRotaryEmbedding, Qwen2_5_VLProcessingInfo, ) from .qwen3_moe import Qwen3MoeForCausalLM, Qwen3MoeModel @@ -221,14 +221,16 @@ def forward( self, x: torch.Tensor, cu_seqlens: torch.Tensor, - rotary_pos_emb: torch.Tensor, + rotary_pos_emb_cos: torch.Tensor, + rotary_pos_emb_sin: torch.Tensor, max_seqlen: torch.Tensor, # Only used for Flash Attention seqlens: torch.Tensor, # Only used for xFormers ) -> torch.Tensor: x = x + self.attn( self.norm1(x), cu_seqlens=cu_seqlens, - rotary_pos_emb=rotary_pos_emb, + rotary_pos_emb_cos=rotary_pos_emb_cos, + rotary_pos_emb_sin=rotary_pos_emb_sin, max_seqlen=max_seqlen, seqlens=seqlens, ) @@ -332,7 +334,13 @@ def __init__( norm_layer = partial(nn.LayerNorm, eps=norm_eps) head_dim = self.hidden_size // self.num_heads - self.rotary_pos_emb = Qwen2_5_VisionRotaryEmbedding(head_dim // 2) + self.rotary_pos_emb = get_rope( + head_size=head_dim, + rotary_dim=head_dim // 2, + max_position=8192, + base=10000.0, + 
is_neox_style=True, + ) self.blocks = nn.ModuleList( [ @@ -416,9 +424,19 @@ def rot_pos_emb(self, grid_thw): pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) pos_ids = torch.cat(pos_ids, dim=0) max_grid_size = grid_thw[:, 1:].max() - rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) - rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) - return rotary_pos_emb + + # Use pre-computed cos_sin_cache from RotaryEmbedding + cos, sin = self.rotary_pos_emb.get_cos_sin(max_grid_size) + + cos_h = cos[pos_ids[:, 0]] # (num_tokens, rotary_dim // 2) + cos_w = cos[pos_ids[:, 1]] + sin_h = sin[pos_ids[:, 0]] + sin_w = sin[pos_ids[:, 1]] + + cos_combined = torch.cat([cos_h, cos_w], dim=-1) + sin_combined = torch.cat([sin_h, sin_w], dim=-1) + + return cos_combined, sin_combined def fast_pos_embed_interpolate(self, grid_thw: list[list[int]]) -> torch.Tensor: num_grid_per_side = self.num_grid_per_side @@ -508,7 +526,7 @@ def forward( if self.apply_vit_abs_pos_embed: pos_embeds = self.fast_pos_embed_interpolate(grid_thw) hidden_states = hidden_states + pos_embeds - rotary_pos_emb = self.rot_pos_emb(grid_thw) + rotary_pos_emb_cos, rotary_pos_emb_sin = self.rot_pos_emb(grid_thw) cu_seqlens = torch.repeat_interleave( grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0] @@ -519,7 +537,8 @@ def forward( cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0) hidden_states = hidden_states.unsqueeze(1) - rotary_pos_emb = rotary_pos_emb.to(hidden_states.device) + rotary_pos_emb_cos = rotary_pos_emb_cos.to(hidden_states.device) + rotary_pos_emb_sin = rotary_pos_emb_sin.to(hidden_states.device) max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens) hidden_states_list = [] @@ -529,7 +548,8 @@ def forward( hidden_states = blk( hidden_states, cu_seqlens=cu_seqlens, - rotary_pos_emb=rotary_pos_emb, + rotary_pos_emb_cos=rotary_pos_emb_cos, + rotary_pos_emb_sin=rotary_pos_emb_sin, max_seqlen=max_seqlen, seqlens=seqlens, ) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 7f0c9372991d..99a4007ef7f2 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -63,6 +63,7 @@ ) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.module_mapping import MultiModelKeys @@ -95,7 +96,6 @@ ) from .qwen2_5_vl import ( Qwen2_5_VisionAttention, - Qwen2_5_VisionRotaryEmbedding, Qwen2_5_VLImageEmbeddingInputs, Qwen2_5_VLImageInputs, Qwen2_5_VLImagePixelInputs, @@ -232,14 +232,16 @@ def forward( self, x: torch.Tensor, cu_seqlens: torch.Tensor, - rotary_pos_emb: torch.Tensor, + rotary_pos_emb_cos: torch.Tensor, + rotary_pos_emb_sin: torch.Tensor, max_seqlen: torch.Tensor, # Only used for Flash Attention seqlens: torch.Tensor, # Only used for xFormers ) -> torch.Tensor: x = x + self.attn( self.norm1(x), cu_seqlens=cu_seqlens, - rotary_pos_emb=rotary_pos_emb, + rotary_pos_emb_cos=rotary_pos_emb_cos, + rotary_pos_emb_sin=rotary_pos_emb_sin, max_seqlen=max_seqlen, seqlens=seqlens, ) @@ -339,7 +341,13 @@ def __init__( norm_layer = partial(nn.LayerNorm, eps=norm_eps) head_dim = self.hidden_size // self.num_heads - self.rotary_pos_emb = Qwen2_5_VisionRotaryEmbedding(head_dim // 2) + 
self.rotary_pos_emb = get_rope( + head_size=head_dim, + rotary_dim=head_dim // 2, + max_position=8192, + base=10000.0, + is_neox_style=True, + ) self.merger = Qwen3_VisionPatchMerger( d_model=vision_config.out_hidden_size, @@ -452,9 +460,19 @@ def rot_pos_emb(self, grid_thw: list[list[int]]): for t, h, w in grid_thw ] pos_ids = torch.cat(pos_ids, dim=0) - rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) - rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) - return rotary_pos_emb + + # Use pre-computed cos_sin_cache from RotaryEmbedding + cos, sin = self.rotary_pos_emb.get_cos_sin(max_grid_size) + + cos_h = cos[pos_ids[:, 0]] # (num_tokens, rotary_dim // 2) + cos_w = cos[pos_ids[:, 1]] + sin_h = sin[pos_ids[:, 0]] + sin_w = sin[pos_ids[:, 1]] + + cos_combined = torch.cat([cos_h, cos_w], dim=-1) + sin_combined = torch.cat([sin_h, sin_w], dim=-1) + + return cos_combined, sin_combined def fast_pos_embed_interpolate(self, grid_thw: list[list[int]]) -> torch.Tensor: num_grid_per_side = self.num_grid_per_side @@ -547,8 +565,13 @@ def forward( pos_embeds = self.fast_pos_embed_interpolate(grid_thw_list) hidden_states = hidden_states + pos_embeds - rotary_pos_emb = self.rot_pos_emb(grid_thw_list) - rotary_pos_emb = rotary_pos_emb.to(hidden_states.device, non_blocking=True) + rotary_pos_emb_cos, rotary_pos_emb_sin = self.rot_pos_emb(grid_thw_list) + rotary_pos_emb_cos = rotary_pos_emb_cos.to( + hidden_states.device, non_blocking=True + ) + rotary_pos_emb_sin = rotary_pos_emb_sin.to( + hidden_states.device, non_blocking=True + ) cu_seqlens = torch.repeat_interleave( grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0] @@ -564,7 +587,8 @@ def forward( hidden_states = blk( hidden_states, cu_seqlens=cu_seqlens, - rotary_pos_emb=rotary_pos_emb, + rotary_pos_emb_cos=rotary_pos_emb_cos, + rotary_pos_emb_sin=rotary_pos_emb_sin, max_seqlen=max_seqlen, seqlens=seqlens, ) From 184b12fdc6dce87485e3bd793e13e90421f93924 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Tue, 18 Nov 2025 15:07:50 +0100 Subject: [PATCH 153/578] [Bugfix][NIXL] Fix `block_size_ratio` when logical !=physical blocks (#28925) Signed-off-by: NickLucche Co-authored-by: Cyrus Leung --- .../kv_connector/v1/nixl_connector.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index a70c98b63713..5ff95876ef34 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -677,12 +677,13 @@ class TpKVTopology: mapping between local and remote TP workers. 
""" - tp_size: int tp_rank: int remote_tp_size: dict[EngineId, int] is_mla: bool total_num_kv_heads: int attn_backend: type[AttentionBackend] + engine_id: EngineId + remote_block_size: dict[EngineId, int] def __post_init__(self): # Figure out whether the first dimension of the cache is K/V @@ -710,8 +711,13 @@ def split_k_and_v(self) -> bool: self.is_mla or self._use_pallas or self.is_kv_layout_blocks_first ) - block_size: int - remote_block_size: dict[EngineId, int] + @property + def tp_size(self) -> int: + return self.remote_tp_size[self.engine_id] + + @property + def block_size(self) -> int: + return self.remote_block_size[self.engine_id] def tp_ratio( self, @@ -957,13 +963,12 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str): self.xfer_stats = NixlKVConnectorStats() self.kv_topo = self.TpKVTopology( - tp_size=self.world_size, tp_rank=self.tp_rank, + engine_id=self.engine_id, remote_tp_size=self._tp_size, # shared state + remote_block_size=self._block_size, # shared state is_mla=self.use_mla, total_num_kv_heads=self.model_config.get_total_num_kv_heads(), - block_size=self.block_size, - remote_block_size=self._block_size, attn_backend=backend, ) self._use_pallas = self.kv_topo._use_pallas @@ -1185,6 +1190,7 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): self.block_size // kernel_block_size ) self.block_size = kernel_block_size + self._block_size[self.engine_id] = kernel_block_size seen_base_addresses.append(base_addr) curr_tensor_size_bytes = cache.numel() * cache.element_size() From f6aa122698790fb1a544e8d80ba97c49f02be945 Mon Sep 17 00:00:00 2001 From: Alex <30671301+killershrimp@users.noreply.github.com> Date: Tue, 18 Nov 2025 08:21:48 -0600 Subject: [PATCH 154/578] [CI Sprint] Quantization CI Cleanup (#24130) Signed-off-by: Alex Yun --- tests/quantization/test_compressed_tensors.py | 4 ++-- tests/quantization/test_cpu_offload.py | 16 ++++++++-------- tests/quantization/test_experts_int8.py | 6 ++++-- tests/quantization/test_fp8.py | 13 ++++++++----- tests/quantization/test_ipex_quant.py | 4 ++-- tests/quantization/test_lm_head.py | 2 +- tests/quantization/test_modelopt.py | 2 +- tests/quantization/test_ptpc_fp8.py | 3 ++- .../test_register_quantization_config.py | 6 +++--- tests/quantization/test_torchao.py | 2 +- 10 files changed, 32 insertions(+), 26 deletions(-) diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index e7d902ed26aa..31b65189b5ec 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -141,7 +141,7 @@ def zp_valid(zp: torch.Tensor | None): "neuralmagic/Llama-3.2-1B-quantized.w8a8", ], ) -@pytest.mark.parametrize("max_tokens", [8]) +@pytest.mark.parametrize("max_tokens", [4]) @pytest.mark.parametrize("num_logprobs", [10]) @pytest.mark.parametrize( "use_aiter", [True, False] if current_platform.is_rocm() else [False] @@ -182,7 +182,7 @@ def test_compressed_tensors_w8a8_logprobs( example_prompts, max_tokens, num_logprobs ) - with vllm_runner(model_path, dtype=dtype) as vllm_model: + with vllm_runner(model_path, dtype=dtype, enforce_eager=True) as vllm_model: vllm_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs ) diff --git a/tests/quantization/test_cpu_offload.py b/tests/quantization/test_cpu_offload.py index a3fb4a695347..1591ce1c4f5a 100644 --- a/tests/quantization/test_cpu_offload.py +++ b/tests/quantization/test_cpu_offload.py @@ -19,8 +19,8 @@ def test_cpu_offload_fp8(): # Test 
loading a quantized checkpoint compare_two_settings( "neuralmagic/Qwen2-1.5B-Instruct-FP8", - [], - ["--cpu-offload-gb", "1"], + ["--enforce_eager"], + ["--enforce_eager", "--cpu-offload-gb", "1"], max_wait_seconds=480, ) @@ -35,8 +35,8 @@ def test_cpu_offload_gptq(monkeypatch): # Test GPTQ Marlin compare_two_settings( "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", - [], - ["--cpu-offload-gb", "1"], + ["--enforce_eager"], + ["--enforce_eager", "--cpu-offload-gb", "1"], max_wait_seconds=480, ) @@ -51,8 +51,8 @@ def test_cpu_offload_awq(monkeypatch): # Test AWQ Marlin compare_two_settings( "Qwen/Qwen2-1.5B-Instruct-AWQ", - [], - ["--cpu-offload-gb", "1"], + ["--enforce_eager"], + ["--enforce_eager", "--cpu-offload-gb", "1"], max_wait_seconds=480, ) @@ -67,7 +67,7 @@ def test_cpu_offload_compressed_tensors(monkeypatch): # Test wNa16 compare_two_settings( "nm-testing/tinyllama-oneshot-w4a16-channel-v2", - [], - ["--cpu-offload-gb", "1"], + ["--enforce_eager"], + ["--enforce_eager", "--cpu-offload-gb", "1"], max_wait_seconds=480, ) diff --git a/tests/quantization/test_experts_int8.py b/tests/quantization/test_experts_int8.py index 2a72f734e431..b992e976ac30 100644 --- a/tests/quantization/test_experts_int8.py +++ b/tests/quantization/test_experts_int8.py @@ -21,7 +21,7 @@ ) @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("max_tokens", [10]) +@pytest.mark.parametrize("max_tokens", [4]) def test_model_experts_int8_startup( hf_runner, vllm_runner, @@ -33,5 +33,7 @@ def test_model_experts_int8_startup( model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_transformers_version(on_fail="skip") - with vllm_runner(model, dtype=dtype, quantization="experts_int8") as vllm_model: + with vllm_runner( + model, dtype=dtype, enforce_eager=True, quantization="experts_int8" + ) as vllm_model: vllm_model.generate_greedy(example_prompts, max_tokens) diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index f02da2996ffe..7bcac9ad768e 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -45,10 +45,10 @@ def test_model_load_and_run( if force_marlin: monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1") - with vllm_runner(model_id) as llm: + with vllm_runner(model_id, enforce_eager=True) as llm: # note: this does not test accuracy, just that we can run through # see lm-eval tests for accuracy - outputs = llm.generate_greedy(["Hello my name is"], max_tokens=10) + outputs = llm.generate_greedy(["Hello my name is"], max_tokens=4) print(outputs[0][1]) @@ -85,7 +85,7 @@ def test_kv_cache_model_load_and_run( # `LLM.apply_model` requires pickling a function. 
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") - with vllm_runner(model_id, kv_cache_dtype="fp8") as llm: + with vllm_runner(model_id, kv_cache_dtype="fp8", enforce_eager=True) as llm: def check_model(model): attn = model.model.layers[0].self_attn.attn @@ -112,7 +112,7 @@ def check_model(model): # note: this does not test accuracy, just that we can run through # see lm-eval tests for accuracy - outputs = llm.generate_greedy(["Hello my name is"], max_tokens=10) + outputs = llm.generate_greedy(["Hello my name is"], max_tokens=4) print(outputs[0][1]) @@ -142,7 +142,10 @@ def test_load_fp16_model( monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1") with vllm_runner( - "facebook/opt-125m", quantization="fp8", kv_cache_dtype=kv_cache_dtype + "facebook/opt-125m", + quantization="fp8", + enforce_eager=True, + kv_cache_dtype=kv_cache_dtype, ) as llm: def check_model(model): diff --git a/tests/quantization/test_ipex_quant.py b/tests/quantization/test_ipex_quant.py index ae9b1df3377d..4f3c52df6c28 100644 --- a/tests/quantization/test_ipex_quant.py +++ b/tests/quantization/test_ipex_quant.py @@ -26,7 +26,7 @@ @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", DTYPE) def test_ipex_quant(vllm_runner, model, dtype): - with vllm_runner(model, dtype=dtype) as llm: - output = llm.generate_greedy(["The capital of France is"], max_tokens=32) + with vllm_runner(model, dtype=dtype, enforce_eager=True) as llm: + output = llm.generate_greedy(["The capital of France is"], max_tokens=4) assert output print(output) diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py index f009a4cfb870..d92dfaa2cc7b 100644 --- a/tests/quantization/test_lm_head.py +++ b/tests/quantization/test_lm_head.py @@ -49,4 +49,4 @@ def check_model(model): vllm_model.apply_model(check_model) - print(vllm_model.generate_greedy(["Hello my name is"], max_tokens=10)[0][1]) + print(vllm_model.generate_greedy(["Hello my name is"], max_tokens=4)[0][1]) diff --git a/tests/quantization/test_modelopt.py b/tests/quantization/test_modelopt.py index 8abf65d29784..0298994c396f 100644 --- a/tests/quantization/test_modelopt.py +++ b/tests/quantization/test_modelopt.py @@ -88,6 +88,6 @@ def check_model(model): llm.apply_model(check_model) # Run a simple generation test to ensure the model works - output = llm.generate_greedy(["Hello my name is"], max_tokens=20) + output = llm.generate_greedy(["Hello my name is"], max_tokens=4) assert output print(f"ModelOpt FP8 output: {output}") diff --git a/tests/quantization/test_ptpc_fp8.py b/tests/quantization/test_ptpc_fp8.py index e8ea4148585b..61efd2ce66c7 100644 --- a/tests/quantization/test_ptpc_fp8.py +++ b/tests/quantization/test_ptpc_fp8.py @@ -38,6 +38,7 @@ def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None: "facebook/opt-125m", dtype=dtype, quantization="ptpc_fp8", + enforce_eager=True, kv_cache_dtype=kv_cache_dtype, ) except AssertionError as e: @@ -65,5 +66,5 @@ def check_model(model): llm.apply_model(check_model) - output = llm.generate_greedy("Hello my name is", max_tokens=20) + output = llm.generate_greedy("Hello my name is", max_tokens=4) assert output diff --git a/tests/quantization/test_register_quantization_config.py b/tests/quantization/test_register_quantization_config.py index 8da048703df9..a09856c78559 100644 --- a/tests/quantization/test_register_quantization_config.py +++ b/tests/quantization/test_register_quantization_config.py @@ -23,8 +23,8 @@ get_quantization_config, register_quantization_config, 
) -from vllm.model_executor.layers.quantization.base_config import ( # noqa: E501 - QuantizationConfig, +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, # noqa: E501 ) @@ -142,5 +142,5 @@ def check_model(model): llm.apply_model(check_model) - output = llm.generate_greedy("Hello my name is", max_tokens=20) + output = llm.generate_greedy("Hello my name is", max_tokens=1) assert output diff --git a/tests/quantization/test_torchao.py b/tests/quantization/test_torchao.py index 82413f36e997..fb8d6130c377 100644 --- a/tests/quantization/test_torchao.py +++ b/tests/quantization/test_torchao.py @@ -392,7 +392,7 @@ def get_weight_attrs(model): assert not has_int4_preshuffled_tensor assert weight_attrs == [False, 1, 0, True] - output = llm.generate_greedy(["The capital of France is"], max_tokens=32) + output = llm.generate_greedy(["The capital of France is"], max_tokens=4) assert output From 49a986ecd445db2220e750b61ba653658ea3db9b Mon Sep 17 00:00:00 2001 From: Ido Segev Date: Tue, 18 Nov 2025 18:38:22 +0200 Subject: [PATCH 155/578] [Benchmark] multi_turn: Report warmup-inclusive runtime (#28937) Signed-off-by: Ido Segev --- benchmarks/multi_turn/README.md | 4 ++ .../benchmark_serving_multi_turn.py | 59 +++++++++++++++---- 2 files changed, 53 insertions(+), 10 deletions(-) diff --git a/benchmarks/multi_turn/README.md b/benchmarks/multi_turn/README.md index f5b5c6c97d48..b0be1e3a69a6 100644 --- a/benchmarks/multi_turn/README.md +++ b/benchmarks/multi_turn/README.md @@ -55,6 +55,10 @@ output_num_chunks 166.0 99.01 11.80 79.00 90.00 98.00 108.75 ---------------------------------------------------------------------------------------------------- ``` +If you run with `--warmup-step`, the summary will also include `warmup_runtime_sec` +and `total_runtime_incl_warmup_sec` (while `runtime_sec` continues to reflect the +benchmark-only runtime so the reported throughput stays comparable). + ### JSON configuration file for synthetic conversations generation The input flag `--input-file` is used to determine the input conversations for the benchmark.
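The new summary keys documented above come from timing the warmup phase separately from the measured phase, so throughput stays benchmark-only while the warmup-inclusive total is still reported. As a rough standalone sketch of that reporting pattern (simplified names and a sleep-based stand-in for request handling — not the benchmark's actual API, which follows in the diff below):

```python
import time


def run_phase(num_requests: int, per_request_sec: float) -> float:
    """Stand-in for issuing requests; returns the phase runtime in seconds."""
    start_ns = time.perf_counter_ns()
    time.sleep(num_requests * per_request_sec)
    return (time.perf_counter_ns() - start_ns) / 1e9


def summarize(num_requests: int, warmup_requests: int) -> dict[str, float]:
    # Warmup is timed on its own so it never inflates the reported throughput.
    warmup_runtime_sec = run_phase(warmup_requests, per_request_sec=0.001)
    runtime_sec = run_phase(num_requests, per_request_sec=0.001)
    return {
        "runtime_sec": runtime_sec,                      # benchmark-only
        "requests_per_sec": num_requests / runtime_sec,  # stays comparable
        "warmup_runtime_sec": warmup_runtime_sec,
        "total_runtime_incl_warmup_sec": runtime_sec + warmup_runtime_sec,
    }


if __name__ == "__main__":
    print(summarize(num_requests=100, warmup_requests=10))
```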
diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py index 772d685ad90f..e23f6b923f1b 100644 --- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py +++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py @@ -1076,6 +1076,7 @@ def process_statistics( verbose: bool, gen_conv_args: GenConvArgs | None = None, excel_output: bool = False, + warmup_runtime_sec: float | None = None, ) -> None: if len(client_metrics) == 0: logger.info("No samples to process") @@ -1169,8 +1170,13 @@ def process_statistics( # Convert milliseconds to seconds runtime_sec = runtime_sec / 1000.0 requests_per_sec = float(len(df)) / runtime_sec - - params = {"runtime_sec": runtime_sec, "requests_per_sec": requests_per_sec} + params = { + "runtime_sec": runtime_sec, + "requests_per_sec": requests_per_sec, + } + if warmup_runtime_sec is not None: + params["warmup_runtime_sec"] = warmup_runtime_sec + params["total_runtime_incl_warmup_sec"] = runtime_sec + warmup_runtime_sec # Generate a summary of relevant metrics (and drop irrelevant data) df = df.drop(columns=exclude).describe(percentiles=percentiles).transpose() @@ -1552,6 +1558,8 @@ async def main() -> None: url=args.url, num_clients=args.num_clients, early_stop=not args.no_early_stop ) + warmup_runtime_sec: float | None = None + # Warm-up step if args.warmup_step: # Only send a single user prompt from every conversation. @@ -1566,26 +1574,56 @@ async def main() -> None: # all clients should finish their work before exiting warmup_bench_args = bench_args._replace(early_stop=False) - logger.info(f"{Color.PURPLE}Warmup start{Color.RESET}") + logger.info("%sWarmup start%s", Color.PURPLE, Color.RESET) + warmup_start_ns = time.perf_counter_ns() conversations, _ = await main_mp( warmup_client_args, req_args, warmup_bench_args, tokenizer, conversations ) - logger.info(f"{Color.PURPLE}Warmup done{Color.RESET}") + warmup_runtime_sec = nanosec_to_sec(time.perf_counter_ns() - warmup_start_ns) + logger.info( + "%sWarmup runtime: %.3f sec (%.3f ms)%s", + Color.PURPLE, + warmup_runtime_sec, + warmup_runtime_sec * 1000, + Color.RESET, + ) + logger.info("%sWarmup done%s", Color.PURPLE, Color.RESET) # Run the benchmark - start_time = time.perf_counter_ns() + benchmark_start_ns = time.perf_counter_ns() client_convs, client_metrics = await main_mp( client_args, req_args, bench_args, tokenizer, conversations ) - total_runtime_ms = nanosec_to_millisec(time.perf_counter_ns() - start_time) + benchmark_runtime_sec = nanosec_to_sec(time.perf_counter_ns() - benchmark_start_ns) # Calculate requests per second - total_runtime_sec = total_runtime_ms / 1000.0 - rps = len(client_metrics) / total_runtime_sec + requests_per_sec = len(client_metrics) / benchmark_runtime_sec + benchmark_runtime_ms = benchmark_runtime_sec * 1000.0 logger.info( - f"{Color.GREEN}All clients finished, total runtime: {total_runtime_sec:.3f} sec" - f" ({total_runtime_ms:.3f} ms), requests per second: {rps:.3f}{Color.RESET}" + "%sAll clients finished, benchmark runtime: %.3f sec (%.3f ms), " + "requests per second: %.3f%s", + Color.GREEN, + benchmark_runtime_sec, + benchmark_runtime_ms, + requests_per_sec, + Color.RESET, ) + if warmup_runtime_sec is not None: + total_runtime_sec = benchmark_runtime_sec + warmup_runtime_sec + logger.info( + "%sWarmup runtime: %.3f sec (%.3f ms)%s", + Color.GREEN, + warmup_runtime_sec, + warmup_runtime_sec * 1000, + Color.RESET, + ) + logger.info( + "%sTotal runtime (including warmup): %.3f sec (%.3f ms)%s", + 
Color.GREEN, + total_runtime_sec, + total_runtime_sec * 1000, + Color.RESET, + ) # Benchmark parameters params = { @@ -1610,6 +1648,7 @@ async def main() -> None: verbose=args.verbose, gen_conv_args=gen_conv_args, excel_output=args.excel_output, + warmup_runtime_sec=warmup_runtime_sec, ) if args.output_file is not None: From c2612371ad76a966cbbc443da3f3f91a4f4a3138 Mon Sep 17 00:00:00 2001 From: Luciano Martins Date: Tue, 18 Nov 2025 13:56:29 -0300 Subject: [PATCH 156/578] [Model] Add Gemma3 GGUF multimodal support (#27772) Signed-off-by: Luciano Martins Signed-off-by: Isotr0py Co-authored-by: Luciano Martins Co-authored-by: Isotr0py --- requirements/common.txt | 2 +- .../generation/test_multimodal_gguf.py | 115 +++++++++++ tests/models/quantization/test_gguf.py | 9 +- vllm/config/model.py | 20 +- .../layers/quantization/gguf.py | 67 ++++++- .../model_loader/gguf_loader.py | 186 ++++++++++++++++-- .../model_loader/weight_utils.py | 10 +- vllm/model_executor/models/gemma3_mm.py | 172 ++++++++++------ vllm/model_executor/models/siglip.py | 27 +++ vllm/transformers_utils/config.py | 11 ++ vllm/transformers_utils/gguf_utils.py | 166 ++++++++++++++++ vllm/transformers_utils/processor.py | 31 ++- vllm/transformers_utils/utils.py | 1 + vllm/v1/worker/gpu_model_runner.py | 19 ++ 14 files changed, 751 insertions(+), 85 deletions(-) create mode 100644 tests/models/multimodal/generation/test_multimodal_gguf.py create mode 100644 vllm/transformers_utils/gguf_utils.py diff --git a/requirements/common.txt b/requirements/common.txt index ad92ba3ad827..1058ab91a02a 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -30,7 +30,7 @@ filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/31 partial-json-parser # used for parsing partial JSON outputs pyzmq >= 25.0.0 msgspec -gguf >= 0.13.0 +gguf >= 0.17.0 mistral_common[image] >= 1.8.5 opencv-python-headless >= 4.11.0 # required for video IO pyyaml diff --git a/tests/models/multimodal/generation/test_multimodal_gguf.py b/tests/models/multimodal/generation/test_multimodal_gguf.py new file mode 100644 index 000000000000..e596b20c6302 --- /dev/null +++ b/tests/models/multimodal/generation/test_multimodal_gguf.py @@ -0,0 +1,115 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import Literal, NamedTuple + +import pytest +from huggingface_hub import hf_hub_download +from pytest import MarkDecorator + +from tests.quantization.utils import is_quant_method_supported +from vllm.assets.image import ImageAsset +from vllm.utils.torch_utils import set_default_torch_num_threads + +from ....conftest import PromptImageInput, VllmRunner +from ...utils import check_logprobs_close + + +class GGUFMMTestConfig(NamedTuple): + original_model: str + gguf_repo: str + gguf_backbone: str + gguf_mmproj: str + prompt: list[str] + mm_data: dict[Literal["images"], PromptImageInput] + max_model_len: int = 4096 + marks: list[MarkDecorator] = [] + + @property + def gguf_model(self): + hf_hub_download(self.gguf_repo, filename=self.gguf_mmproj) + return hf_hub_download(self.gguf_repo, filename=self.gguf_backbone) + + +GEMMA3_CONFIG = GGUFMMTestConfig( + original_model="google/gemma-3-4b-it", + gguf_repo="google/gemma-3-4b-it-qat-q4_0-gguf", + gguf_backbone="gemma-3-4b-it-q4_0.gguf", + gguf_mmproj="mmproj-model-f16-4B.gguf", + prompt=["Describe this image in detail:"], + mm_data={"images": [ImageAsset("stop_sign").pil_image]}, + marks=[pytest.mark.core_model], +) + 
+MODELS_TO_TEST = [GEMMA3_CONFIG] + + +def run_multimodal_gguf_test( + vllm_runner: type[VllmRunner], + model: GGUFMMTestConfig, + dtype: str, + max_tokens: int, + num_logprobs: int, +): + # Run gguf model. + with ( + set_default_torch_num_threads(1), + vllm_runner( + model_name=model.gguf_model, + enforce_eager=True, + tokenizer_name=model.original_model, + dtype=dtype, + max_model_len=model.max_model_len, + ) as gguf_model, + ): + gguf_outputs = gguf_model.generate_greedy_logprobs( + prompts=model.prompt, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + **model.mm_data, + ) + + # Run unquantized model. + with vllm_runner( + model_name=model.original_model, + enforce_eager=True, # faster tests + dtype=dtype, + max_model_len=model.max_model_len, + ) as original_model: + original_outputs = original_model.generate_greedy_logprobs( + prompts=model.prompt, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + **model.mm_data, + ) + + check_logprobs_close( + outputs_0_lst=original_outputs, + outputs_1_lst=gguf_outputs, + name_0="original", + name_1="gguf", + ) + + +@pytest.mark.skipif( + not is_quant_method_supported("gguf"), + reason="gguf is not supported on this GPU type.", +) +@pytest.mark.parametrize( + "model", + [ + pytest.param(test_config, marks=test_config.marks) + for test_config in MODELS_TO_TEST + ], +) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [10]) +def test_models( + vllm_runner: type[VllmRunner], + model: GGUFMMTestConfig, + dtype: str, + max_tokens: int, + num_logprobs: int, +) -> None: + run_multimodal_gguf_test(vllm_runner, model, dtype, max_tokens, num_logprobs) diff --git a/tests/models/quantization/test_gguf.py b/tests/models/quantization/test_gguf.py index 5e2438857aee..3b9597507ac1 100644 --- a/tests/models/quantization/test_gguf.py +++ b/tests/models/quantization/test_gguf.py @@ -78,6 +78,12 @@ def gguf_model(self): gguf_filename="tinydolphin-2.8-1.1b.Q6_K.gguf", ) +GEMMA3_CONFIG = GGUFTestConfig( + original_model="google/gemma-3-270m-it", + gguf_repo="ggml-org/gemma-3-270m-it-qat-GGUF", + gguf_filename="gemma-3-270m-it-qat-Q4_0.gguf", +) + MODELS = [ # LLAMA_CONFIG, # broken: https://github.com/vllm-project/vllm/issues/19458 QWEN2_CONFIG, @@ -85,6 +91,7 @@ def gguf_model(self): GPT2_CONFIG, STABLELM_CONFIG, DOLPHIN_CONFIG, + GEMMA3_CONFIG, # STARCODER_CONFIG, # broken ] @@ -148,7 +155,7 @@ def check_model_outputs( "model", [pytest.param(test_config, marks=test_config.marks) for test_config in MODELS], ) -@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("tp_size", [1]) diff --git a/vllm/config/model.py b/vllm/config/model.py index b3a28af6de38..49fe0bcd9a2a 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -33,10 +33,14 @@ try_get_generation_config, try_get_safetensors_metadata, try_get_tokenizer_config, + uses_custom_attention_masks, uses_mrope, ) +from vllm.transformers_utils.gguf_utils import ( + maybe_patch_hf_config_from_gguf, +) from vllm.transformers_utils.runai_utils import ObjectStorageModel, is_runai_obj_uri -from vllm.transformers_utils.utils import maybe_model_redirect +from vllm.transformers_utils.utils import check_gguf_file, maybe_model_redirect from vllm.utils.import_utils import LazyLoader from vllm.utils.torch_utils import common_broadcastable_dtype @@ -450,6 +454,12 @@ def 
__post_init__( self.model = maybe_model_redirect(self.model) # The tokenizer is consistent with the model by default. if self.tokenizer is None: + if check_gguf_file(self.model): + raise ValueError( + "Using a tokenizer is mandatory when loading a GGUF model. " + "Please specify the tokenizer path or name using the " + "--tokenizer argument." + ) self.tokenizer = self.model if self.tokenizer_revision is None: self.tokenizer_revision = self.revision @@ -508,6 +518,10 @@ def __post_init__( hf_overrides_kw=hf_overrides_kw, hf_overrides_fn=hf_overrides_fn, ) + hf_config = maybe_patch_hf_config_from_gguf( + self.model, + hf_config, + ) self.hf_config = hf_config if dict_overrides: @@ -1605,6 +1619,10 @@ def uses_alibi(self) -> bool: def uses_mrope(self) -> bool: return uses_mrope(self.hf_config) + @property + def uses_custom_attention_masks(self) -> bool: + return uses_custom_attention_masks(self.hf_config) + @property def is_multimodal_model(self) -> bool: return self.multimodal_config is not None diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index caabcd0ca0ee..42d7a67371ae 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable +from collections.abc import Callable, Mapping +from types import MappingProxyType from typing import Any, Optional import gguf @@ -26,7 +27,11 @@ QuantizationConfig, QuantizeMethodBase, ) -from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding +from vllm.model_executor.layers.vocab_parallel_embedding import ( + UnquantizedEmbeddingMethod, + VocabParallelEmbedding, +) +from vllm.model_executor.models.utils import WeightsMapper from vllm.model_executor.utils import set_weight_attrs from vllm.utils.torch_utils import direct_register_custom_op @@ -65,18 +70,70 @@ def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> Optional["QuantizeMethodBase"]: if isinstance(layer, LinearBase): - if is_layer_skipped_gguf(prefix, self.unquantized_modules): + if is_layer_skipped_gguf( + prefix, self.unquantized_modules, self.packed_modules_mapping + ): return UnquantizedLinearMethod() return GGUFLinearMethod(self) elif isinstance(layer, VocabParallelEmbedding): + if is_layer_skipped_gguf( + prefix, self.unquantized_modules, self.packed_modules_mapping + ): + return UnquantizedEmbeddingMethod() return GGUFEmbeddingMethod(self) elif isinstance(layer, FusedMoE): return GGUFMoEMethod(self, layer.moe_config) return None + def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): + """ + Interface for models to update module names referenced in + quantization configs in order to reflect the vllm model structure + + :param hf_to_vllm_mapper: maps from hf model structure (the assumed + structure of the qconfig) to vllm model structure + """ + if self.unquantized_modules is not None: + self.unquantized_modules = hf_to_vllm_mapper.apply_list( + self.unquantized_modules + ) + + +def is_layer_skipped_gguf( + prefix: str, + unquantized_modules: list[str], + fused_mapping: Mapping[str, list[str]] = MappingProxyType({}), +): + # Fused layers like gate_up_proj or qkv_proj will not be fused + # in the safetensors checkpoint. So, we convert the name + # from the fused version to unfused + check to make sure that + # each shard of the fused layer has the same scheme. 
+ proj_name = prefix.split(".")[-1] + if proj_name in fused_mapping: + shard_prefixes = [ + prefix.replace(proj_name, shard_proj_name) + for shard_proj_name in fused_mapping[proj_name] + ] + + is_skipped = None + for shard_prefix in shard_prefixes: + is_shard_skipped = any( + shard_prefix in module_name for module_name in unquantized_modules + ) + + if is_skipped is None: + is_skipped = is_shard_skipped + elif is_shard_skipped != is_skipped: + raise ValueError( + f"Detected some but not all shards of {prefix} " + "are quantized. All shards of fused layers " + "to have the same precision." + ) + else: + is_skipped = any(module_name in prefix for module_name in unquantized_modules) -def is_layer_skipped_gguf(prefix: str, unquantized_modules: list[str]): - return any(module_name in prefix for module_name in unquantized_modules) + assert is_skipped is not None + return is_skipped UNQUANTIZED_TYPES = {WeightType.F32, WeightType.F16, WeightType.BF16} diff --git a/vllm/model_executor/model_loader/gguf_loader.py b/vllm/model_executor/model_loader/gguf_loader.py index 7db1fc167c4f..2416836be03c 100644 --- a/vllm/model_executor/model_loader/gguf_loader.py +++ b/vllm/model_executor/model_loader/gguf_loader.py @@ -7,10 +7,11 @@ import torch import torch.nn as nn from huggingface_hub import hf_hub_download -from transformers import AutoModelForCausalLM +from transformers import AutoModelForCausalLM, AutoModelForImageTextToText from vllm.config import ModelConfig, VllmConfig from vllm.config.load import LoadConfig +from vllm.logger import init_logger from vllm.model_executor.model_loader.base_loader import BaseModelLoader from vllm.model_executor.model_loader.utils import ( initialize_model, @@ -21,8 +22,11 @@ get_gguf_weight_type_map, gguf_quant_weights_iterator, ) +from vllm.transformers_utils.gguf_utils import detect_gguf_multimodal from vllm.utils.torch_utils import set_default_torch_dtype +logger = init_logger(__name__) + class GGUFModelLoader(BaseModelLoader): """ @@ -67,7 +71,15 @@ def _get_gguf_weights_map(self, model_config: ModelConfig): https://github.com/ggerganov/ggml/blob/master/docs/gguf.md for details. """ config = model_config.hf_config + # Get text config to handle both nested (multimodal) and flat + # (text-only) config structures. For multimodal models like + # Gemma3Config, this returns config.text_config. For text-only + # models, this returns config itself. 
+ text_config = config.get_text_config() model_type = config.model_type + is_multimodal = ( + hasattr(config, "vision_config") and config.vision_config is not None + ) gguf_to_hf_name_map = {} # hack: ggufs have a different name than transformers if model_type == "cohere": @@ -115,24 +127,167 @@ def _get_gguf_weights_map(self, model_config: ModelConfig): break if arch is None: raise RuntimeError(f"Unknown gguf model_type: {model_type}") - num_layers = config.num_hidden_layers - name_map = gguf.get_tensor_name_map(arch, num_layers) + text_num_layers = text_config.num_hidden_layers + text_name_map = gguf.get_tensor_name_map(arch, text_num_layers) + + if is_multimodal: + mm_proj_arch = gguf.MODEL_ARCH.MMPROJ + vision_num_layers = config.vision_config.num_hidden_layers + vision_name_map = gguf.get_tensor_name_map(mm_proj_arch, vision_num_layers) + else: + vision_name_map = None + + # Create dummy model to extract parameter names + # For multimodal: use AutoModelForImageTextToText to get + # language + vision + projector params + # For text-only: use AutoModelForCausalLM to get language model params + auto_cls = ( + AutoModelForImageTextToText if is_multimodal else AutoModelForCausalLM + ) with torch.device("meta"): - dummy_model = AutoModelForCausalLM.from_config( + dummy_model = auto_cls.from_config( config, trust_remote_code=model_config.trust_remote_code ) + state_dict = dummy_model.state_dict() + if hf_checkpoint_map := getattr( + dummy_model, "_checkpoint_conversion_mapping", None + ): + + def revert_hf_rename(name: str) -> str: + for original_name, hf_name in hf_checkpoint_map.items(): + if hf_name in name: + name = name.replace(hf_name, original_name).lstrip("^") + return name + + state_dict = { + revert_hf_rename(name): tensor for name, tensor in state_dict.items() + } + + def find_hf_name_in_tensor_map(hf_name: str) -> str | None: + """ + Map HuggingFace parameter name to GGUF tensor name. + + This function handles the mismatch between HF parameter naming + conventions and gguf-py's expected format: + 1. Strips 'model.' prefix (common in multimodal models) + 2. Converts '_weight' suffix to '.weight' (Gemma3 compatibility) + 3. Searches vision_name_map for multimodal parameters + 4. Falls back to text_name_map for language model parameters + + Args: + hf_name: Full HuggingFace parameter name (e.g., + 'model.multi_modal_projector.mm_soft_emb_norm.weight') + + Returns: + GGUF tensor name with suffix (e.g., 'mm.soft_emb_norm.weight') + or None if no mapping found + """ + # Strip 'language_model.' prefix for multimodal models - gguf-py + # tensor mappings expect parameter names without this prefix. + # Note: 'model.' prefix should be KEPT for text-only models as + # gguf-py expects it. + if hf_name.startswith("language_model."): + hf_name = hf_name[15:] # Remove 'language_model.' 
+ + # Parse parameter name and suffix + if hf_name.endswith((".weight", ".bias")): + base_name, suffix = hf_name.rsplit(".", 1) + else: + base_name, suffix = hf_name, "" + # Handle '_weight' suffix (Gemma3 naming: parameter ends with + # '_weight' instead of '.weight') + if base_name.endswith("_weight"): + base_name = base_name[:-7] # Remove '_weight' + suffix = "weight" + + gguf_name = None + # Priority 1: Search vision/projector parameters for multimodal models + if vision_name_map is not None: + gguf_name = vision_name_map.get_name(base_name) + + # Priority 2: Search text backbone parameters + if gguf_name is None: + gguf_name = text_name_map.get_name(base_name) + + if gguf_name is None: + return None + return gguf_name + "." + suffix + + # Build mapping and track unmapped parameters + unmapped_params = [] for hf_name in state_dict: - name, suffix = hf_name.rsplit(".", 1) - gguf_name = name_map.get_name(name) - gguf_to_hf_name_map[f"{gguf_name}.{suffix}"] = hf_name + gguf_name_with_suffix = find_hf_name_in_tensor_map(hf_name) + + # Track mapping success + if gguf_name_with_suffix is not None: + gguf_to_hf_name_map[gguf_name_with_suffix] = hf_name + logger.debug("Mapped GGUF %s → HF %s", gguf_name_with_suffix, hf_name) + elif hf_name not in gguf_to_hf_name_map.values(): + # Parameter not in manual overrides either + unmapped_params.append(hf_name) + + # All parameters must be mapped: both vision/projector and backbone + if unmapped_params: + raise RuntimeError( + f"Failed to map GGUF parameters " + f"({len(unmapped_params)}): " + f"{unmapped_params}" + ) return gguf_to_hf_name_map + def _get_gguf_weight_type( + self, + model_config: ModelConfig, + model_name_or_path: str, + gguf_to_hf_name_map: dict[str, str], + ) -> dict[str, str]: + weight_type_map = get_gguf_weight_type_map( + model_config.model, gguf_to_hf_name_map + ) + is_multimodal = hasattr(model_config.hf_config, "vision_config") + if is_multimodal: + mmproj_file = detect_gguf_multimodal(model_name_or_path) + assert mmproj_file is not None, ( + "Could not find mm_proj file for multimodal GGUF model" + ) + logger.info("Loading extra mm_proj weights from %s...", mmproj_file) + mm_proj_weight_type_map = get_gguf_weight_type_map( + mmproj_file, gguf_to_hf_name_map + ) + weight_type_map.update(mm_proj_weight_type_map) + return weight_type_map + def _get_weights_iterator( - self, model_name_or_path: str, gguf_to_hf_name_map: dict[str, str] + self, + model_config: ModelConfig, + model_name_or_path: str, + gguf_to_hf_name_map: dict[str, str], ) -> Generator[tuple[str, torch.Tensor], None, None]: - return gguf_quant_weights_iterator(model_name_or_path, gguf_to_hf_name_map) + """ + Iterate over GGUF model weights, loading from both main model file and + mmproj.gguf for multimodal Gemma3 models. 
+ + For Gemma3 multimodal GGUF models: + - Main file (gemma-3-*.gguf): Language model weights (model.*) + - mmproj file (mmproj*.gguf): Vision tower + projector weights (v.*, mm.*) + + Yields: + Tuples of (parameter_name, tensor) for all model weights + """ + hf_config = model_config.hf_config + is_multimodal = hasattr(hf_config, "vision_config") + + if is_multimodal: + # Load mm_proj (mm_encoder + projector) for multimodal weights + mmproj_file = detect_gguf_multimodal(model_name_or_path) + assert mmproj_file is not None, ( + "Could not find mm_proj file for multimodal GGUF model" + ) + yield from gguf_quant_weights_iterator(mmproj_file, gguf_to_hf_name_map) + + yield from gguf_quant_weights_iterator(model_name_or_path, gguf_to_hf_name_map) def download_model(self, model_config: ModelConfig) -> None: self._prepare_weights(model_config.model) @@ -141,7 +296,7 @@ def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None: local_model_path = self._prepare_weights(model_config.model) gguf_weights_map = self._get_gguf_weights_map(model_config) model.load_weights( - self._get_weights_iterator(local_model_path, gguf_weights_map) + self._get_weights_iterator(model_config, local_model_path, gguf_weights_map) ) def load_model( @@ -156,14 +311,19 @@ def load_model( ): model_config.hf_config.update({"tie_word_embeddings": True}) - weight_type_map = get_gguf_weight_type_map(model_config.model, gguf_weights_map) - + weight_type_map = self._get_gguf_weight_type( + model_config, local_model_path, gguf_weights_map + ) # filter out unquantized modules to skip unquant_names = [ name.removesuffix(".weight") for name, weight_type in weight_type_map.items() - if weight_type == "F32" and name.endswith(".weight") + if weight_type in ("F32", "F16", "BF16") and name.endswith(".weight") ] + logger.debug( + "GGUF unquantized modules: %s", + unquant_names, + ) vllm_config.quant_config.unquantized_modules.extend(unquant_names) target_device = torch.device(device_config.device) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 93986e5f2fc0..89634cbf4124 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -836,7 +836,11 @@ def gguf_quant_weights_iterator( ) -> Generator[tuple[str, torch.Tensor], None, None]: """ Iterate over the quant weights in the model gguf files and convert - them to torch tensors + them to torch tensors. + Be careful of the order of yielding weight types and weights data, + we have to yield all weight types first before yielding any weights. + Otherwise it would cause issue when loading weights with for packed + layer with different quant types. 
""" reader = gguf.GGUFReader(gguf_file) @@ -846,7 +850,7 @@ def gguf_quant_weights_iterator( weight_type = tensor.tensor_type name = gguf_to_hf_name_map[tensor.name] - if weight_type.name != "F32": + if weight_type.name not in ("F32", "BF16", "F16"): weight_type_name = name.replace("weight", "qweight_type") weight_type = torch.tensor(weight_type) yield weight_type_name, weight_type @@ -856,7 +860,7 @@ def gguf_quant_weights_iterator( weight = tensor.data weight_type = tensor.tensor_type name = gguf_to_hf_name_map[tensor.name] - if weight_type.name != "F32": + if weight_type.name not in ("F32", "BF16", "F16"): name = name.replace("weight", "qweight") param = torch.tensor(weight) yield name, param diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index 8e2bbe8f7990..fe83c8b63b01 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from collections.abc import Iterable, Mapping, Sequence -from typing import Annotated, Any, Literal, TypeAlias +from typing import Annotated, Any, Literal import torch from torch import nn @@ -20,12 +20,7 @@ MultiModalFieldConfig, MultiModalKwargsItems, ) -from vllm.multimodal.parse import ( - ImageEmbeddingItems, - ImageProcessorItems, - ImageSize, - MultiModalDataItems, -) +from vllm.multimodal.parse import ImageProcessorItems, ImageSize, MultiModalDataItems from vllm.multimodal.processing import ( BaseMultiModalProcessor, BaseProcessingInfo, @@ -76,15 +71,7 @@ class Gemma3ImagePixelInputs(TensorSchema): num_patches: Annotated[torch.Tensor, TensorShape("bn")] -class Gemma3ImageEmbeddingInputs(TensorSchema): - type: Literal["image_embeds"] = "image_embeds" - image_embeds: Annotated[ - torch.Tensor, - TensorShape("ni", "nf", "hs"), - ] - - -Gemma3ImageInputs: TypeAlias = Gemma3ImagePixelInputs | Gemma3ImageEmbeddingInputs +Gemma3ImageInputs = Gemma3ImagePixelInputs class Gemma3ProcessingInfo(BaseProcessingInfo): @@ -191,9 +178,8 @@ def get_num_crops( def get_image_repl( self, *, - image_width: int | None, - image_height: int | None, - num_crops: int | None = None, + image_width: int, + image_height: int, processor: Gemma3Processor | None, ) -> PromptUpdateDetails[str]: if processor is None: @@ -201,13 +187,11 @@ def get_image_repl( boi_token = processor.boi_token - if num_crops is None: - assert image_width is not None and image_height is not None - num_crops = self.get_num_crops( - image_width=image_width, - image_height=image_height, - processor=processor, - ) + num_crops = self.get_num_crops( + image_width=image_width, + image_height=image_height, + processor=processor, + ) if num_crops == 0: image_text = boi_token @@ -337,7 +321,6 @@ def _get_mm_fields_config( return dict( pixel_values=MultiModalFieldConfig.flat_from_sizes("image", num_patches), num_patches=MultiModalFieldConfig.batched("image"), - image_embeds=MultiModalFieldConfig.batched("image"), ) def _get_prompt_updates( @@ -350,19 +333,7 @@ def _get_prompt_updates( image_token = hf_processor.boi_token def get_replacement_gemma3(item_idx: int): - images = mm_items.get_items( - "image", (ImageEmbeddingItems, ImageProcessorItems) - ) - - if isinstance(images, ImageEmbeddingItems): - # For image embedding inputs, only support no crops cases - # since it's not supported in hf processor anyway - return self.info.get_image_repl( - image_width=None, - image_height=None, - num_crops=0, - processor=hf_processor, - ) + images = 
mm_items.get_items("image", ImageProcessorItems) image_size = images.get_image_size(item_idx) return self.info.get_image_repl( @@ -586,19 +557,17 @@ def _parse_and_validate_image_input( pixel_values = kwargs.pop("pixel_values", None) num_patches = kwargs.pop("num_patches", None) image_embeds = kwargs.pop("image_embeds", None) + assert image_embeds is None, "Gemma3 does not support image_embeds." + if pixel_values is None: + return None - if pixel_values is not None: - image_size = self.config.vision_config.image_size - return Gemma3ImagePixelInputs( - pixel_values=pixel_values, - num_patches=num_patches, - resolve_bindings={"h": image_size, "w": image_size}, - ) - elif image_embeds is not None: - return Gemma3ImageEmbeddingInputs( - image_embeds=image_embeds, - type="image_embeds", - ) + image_size = self.config.vision_config.image_size + + return Gemma3ImagePixelInputs( + pixel_values=pixel_values, + num_patches=num_patches, + resolve_bindings={"h": image_size, "w": image_size}, + ) def _image_pixels_to_features( self, @@ -610,9 +579,7 @@ def _image_pixels_to_features( def _process_image_input( self, image_input: Gemma3ImageInputs, - ) -> torch.Tensor | list[torch.Tensor]: - if image_input["type"] == "image_embeds": - return image_input["image_embeds"] + ) -> list[torch.Tensor]: assert self.vision_tower is not None pixel_values = image_input["pixel_values"] @@ -629,13 +596,33 @@ def _process_image_input( def get_language_model(self) -> torch.nn.Module: return self.language_model - def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings: + def get_multimodal_embeddings(self, **kwargs: object) -> MultiModalEmbeddings: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: return [] return self._process_image_input(image_input) + def embed_input_ids( + self, + input_ids: torch.Tensor, + multimodal_embeddings: MultiModalEmbeddings | None = None, + *, + is_multimodal: torch.Tensor | None = None, + handle_oov_mm_token: bool = True, + ) -> torch.Tensor: + # Early return for text-only inference (no multimodal data) + if multimodal_embeddings is None or is_multimodal is None: + return super().embed_input_ids(input_ids) + + # Use interface default with OOV handling enabled + return super().embed_input_ids( + input_ids, + multimodal_embeddings=multimodal_embeddings, + is_multimodal=is_multimodal, + handle_oov_mm_token=handle_oov_mm_token, + ) + def forward( self, input_ids: torch.Tensor, @@ -657,6 +644,79 @@ def forward( return hidden_states + def generate_attention_masks( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + mask_dtype: torch.dtype, + ) -> dict[str, Any]: + """Generate custom attention masks for Gemma3 multimodal inputs. + + This is called by V1 engine's gpu_model_runner during preprocessing + to generate attention masks that allow bidirectional attention between + image tokens while maintaining causal attention for text. + """ + # NOTE(woosuk): Here, we distinguish the sequences by the position id 0. + # This is a HACK. Fix this. 
+ start_indices = (positions == 0).cpu().nonzero() + num_seqs = len(start_indices) + seq_lens = [] + for i in range(num_seqs): + start_idx = start_indices[i] + end_idx = start_indices[i + 1] if i < num_seqs - 1 else len(input_ids) + seq_lens.append(end_idx - start_idx) + + global_attn_masks = [] + local_attn_masks = [] + start_idx = 0 + for seq_idx, seq_len in enumerate(seq_lens): + end_idx = start_idx + seq_len + input_token_ids = input_ids[start_idx:end_idx] + + # Find image token positions + img_pos = input_token_ids == self.config.image_token_index + + start_idx = end_idx + + # Create a global causal mask + global_attn_mask = torch.empty( + 1, + 1, + seq_len, + seq_len, + dtype=mask_dtype, + device=input_ids.device, + ) + global_attn_mask.fill_(float("-inf")) + # Fill the lower triangle with 0 (causal attention) + global_attn_mask = global_attn_mask.triu(diagonal=1) + + # Enable bidirectional attention between image tokens + img_mask = torch.zeros_like(global_attn_mask) + img_mask[:, :, :, img_pos] += 1 + img_mask[:, :, img_pos, :] += 1 + global_attn_mask = torch.where(img_mask == 2, 0, global_attn_mask) + global_attn_masks.append(global_attn_mask) + + # GGUF compatibility: config might be Gemma3TextConfig directly + text_config = getattr(self.config, "text_config", self.config) + sliding_window = text_config.sliding_window + if sliding_window is not None: + # Create a local causal mask with sliding window (1024) + local_attn_mask = torch.ones_like(global_attn_mask) + local_attn_mask = torch.tril(local_attn_mask, diagonal=-sliding_window) + local_attn_mask = torch.where( + local_attn_mask == 0, global_attn_mask, float("-inf") + ) + local_attn_masks.append(local_attn_mask) + + return { + "has_images": True, + "seq_lens": seq_lens, + "global_attn_masks": global_attn_masks, + "local_attn_masks": local_attn_masks, + } + def prepare_attn_masks( self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index b175dd60cf65..42d906d089f9 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -827,6 +827,7 @@ def __init__( ) -> None: super().__init__() + self.quant_config = quant_config self.vision_model = SiglipVisionTransformer( config, quant_config, @@ -911,12 +912,38 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: break else: param = params_dict[name] + param = maybe_swap_ffn_param( + name, param, loaded_weight, params_dict, self.quant_config + ) weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) loaded_params.add(name) return loaded_params +def maybe_swap_ffn_param( + name: str, + param: torch.Tensor, + loaded_weight: torch.Tensor, + params_dict: dict[str, torch.Tensor], + quant_config: QuantizationConfig, +) -> torch.Tensor: + if not (quant_config and quant_config.get_name() == "gguf") or ".fc" not in name: + return param + # Some GGUF models have fc1 and fc2 weights swapped + tp_size = get_tensor_model_parallel_world_size() + output_dim = getattr(param, "output_dim", 0) + output_size = param.size(output_dim) * tp_size + weight_out_size = loaded_weight.size(output_dim) + if ".fc1." in name and output_size != weight_out_size: + new_name = name.replace(".fc1.", ".fc2.") + param = params_dict[new_name] + elif ".fc2." 
in name and output_size != weight_out_size: + new_name = name.replace(".fc2.", ".fc1.") + param = params_dict[new_name] + return param + + # Adapted from: https://github.com/huggingface/transformers/blob/v4.54.1/src/transformers/models/siglip/modeling_siglip.py#L200 class SiglipTextEmbeddings(nn.Module): def __init__(self, config: SiglipTextConfig): diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 49250e071eab..ac4a71648cec 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -477,6 +477,17 @@ def is_interleaved(config: PretrainedConfig) -> bool: return False +def uses_custom_attention_masks(config: PretrainedConfig) -> bool: + """Detect if model uses custom attention mask generation for multimodal. + + Some multimodal models require custom attention masks that enable + bidirectional attention between image tokens while maintaining causal + attention for text tokens. Currently applies to Gemma3 multimodal models. + """ + architectures = getattr(config, "architectures", []) + return "Gemma3ForConditionalGeneration" in architectures + + def _maybe_update_auto_config_kwargs(kwargs: dict[str, Any], model_type: str): """ Update kwargs for AutoConfig initialization based on model_type diff --git a/vllm/transformers_utils/gguf_utils.py b/vllm/transformers_utils/gguf_utils.py new file mode 100644 index 000000000000..2bf59c91a3bb --- /dev/null +++ b/vllm/transformers_utils/gguf_utils.py @@ -0,0 +1,166 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""GGUF utility functions.""" + +from pathlib import Path + +import gguf +from gguf.constants import Keys, VisionProjectorType +from transformers import Gemma3Config, PretrainedConfig, SiglipVisionConfig + +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +def detect_gguf_multimodal(model: str) -> Path | None: + """Check if GGUF model has multimodal projector file. + + Args: + model: Model path string + + Returns: + Path to mmproj file if found, None otherwise + """ + if not model.endswith(".gguf"): + return None + + try: + model_path = Path(model) + if not model_path.is_file(): + return None + + model_dir = model_path.parent + mmproj_patterns = ["mmproj.gguf", "mmproj-*.gguf", "*mmproj*.gguf"] + for pattern in mmproj_patterns: + mmproj_files = list(model_dir.glob(pattern)) + if mmproj_files: + return mmproj_files[0] + return None + except Exception: + return None + + +def extract_vision_config_from_gguf(mmproj_path: str) -> "SiglipVisionConfig | None": + """Extract vision config parameters from mmproj.gguf metadata. + + Reads vision encoder configuration from GGUF metadata fields using + standardized GGUF constants. Automatically detects the projector type + (e.g., gemma3, llama4) and applies model-specific parameters accordingly. + + The function extracts standard CLIP vision parameters from GGUF metadata + and applies projector-type-specific customizations. For unknown projector + types, it uses safe defaults from SiglipVisionConfig. + + Args: + mmproj_path: Path to mmproj.gguf file (str or Path) + + Returns: + SiglipVisionConfig if extraction succeeds, None if any required + field is missing from the GGUF metadata + + Raises: + Exception: Exceptions from GGUF reading (file not found, corrupted + file, etc.) 
propagate directly from gguf.GGUFReader + """ + reader = gguf.GGUFReader(str(mmproj_path)) + + # Detect projector type to apply model-specific parameters + projector_type = None + projector_type_field = reader.get_field(Keys.Clip.PROJECTOR_TYPE) + if projector_type_field: + try: + projector_type = bytes(projector_type_field.parts[-1]).decode("utf-8") + except (AttributeError, UnicodeDecodeError) as e: + logger.warning("Failed to decode projector type from GGUF: %s", e) + + # Map GGUF field constants to SiglipVisionConfig parameters. + # Uses official GGUF constants from gguf-py for standardization. + # Format: {gguf_constant: (param_name, dtype)} + VISION_CONFIG_FIELDS = { + Keys.ClipVision.EMBEDDING_LENGTH: ("hidden_size", int), + Keys.ClipVision.FEED_FORWARD_LENGTH: ("intermediate_size", int), + Keys.ClipVision.BLOCK_COUNT: ("num_hidden_layers", int), + Keys.ClipVision.Attention.HEAD_COUNT: ("num_attention_heads", int), + Keys.ClipVision.IMAGE_SIZE: ("image_size", int), + Keys.ClipVision.PATCH_SIZE: ("patch_size", int), + Keys.ClipVision.Attention.LAYERNORM_EPS: ("layer_norm_eps", float), + } + + # Extract and validate all required fields + config_params = {} + for gguf_key, (param_name, dtype) in VISION_CONFIG_FIELDS.items(): + field = reader.get_field(gguf_key) + if field is None: + logger.warning( + "Missing required vision config field '%s' in mmproj.gguf", + gguf_key, + ) + return None + # Extract scalar value from GGUF field and convert to target type + config_params[param_name] = dtype(field.parts[-1]) + + # Apply model-specific parameters based on projector type + if projector_type == VisionProjectorType.GEMMA3: + # Gemma3 doesn't use the vision pooling head (multihead attention) + # This is a vLLM-specific parameter used in SiglipVisionTransformer + config_params["vision_use_head"] = False + logger.info("Detected Gemma3 projector, disabling vision pooling head") + # Add other projector-type-specific customizations here as needed + # elif projector_type == VisionProjectorType.LLAMA4: + # config_params["vision_use_head"] = ... + + # Create config with extracted parameters + # Note: num_channels and attention_dropout use SiglipVisionConfig defaults + # (3 and 0.0 respectively) which are correct for all models + config = SiglipVisionConfig(**config_params) + + if projector_type: + logger.info( + "Extracted vision config from mmproj.gguf (projector_type: %s)", + projector_type, + ) + else: + logger.info("Extracted vision config from mmproj.gguf metadata") + + return config + + +def maybe_patch_hf_config_from_gguf( + model: str, + hf_config: PretrainedConfig, +) -> PretrainedConfig: + """Patch HF config for GGUF models. + + Applies GGUF-specific patches to HuggingFace config: + 1. For multimodal models: patches architecture and vision config + 2. For all GGUF models: overrides vocab_size from embedding tensor + + This ensures compatibility with GGUF models that have extended + vocabularies (e.g., Unsloth) where the GGUF file contains more + tokens than the HuggingFace tokenizer config specifies. 
+ + Args: + model: Model path string + hf_config: HuggingFace config to patch in-place + + Returns: + Updated HuggingFace config + """ + # Patch multimodal config if mmproj.gguf exists + mmproj_path = detect_gguf_multimodal(model) + if mmproj_path is not None: + vision_config = extract_vision_config_from_gguf(str(mmproj_path)) + + # Create HF config for Gemma3 multimodal + text_config = hf_config.get_text_config() + is_gemma3 = hf_config.model_type in ("gemma3", "gemma3_text") + if vision_config is not None and is_gemma3: + new_hf_config = Gemma3Config.from_text_vision_configs( + text_config=text_config, + vision_config=vision_config, + architectures=["Gemma3ForConditionalGeneration"], + ) + hf_config = new_hf_config + + return hf_config diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py index b3469c1b18f2..8deacb5b0791 100644 --- a/vllm/transformers_utils/processor.py +++ b/vllm/transformers_utils/processor.py @@ -18,7 +18,7 @@ from transformers.video_processing_utils import BaseVideoProcessor from typing_extensions import TypeVar -from vllm.transformers_utils.utils import convert_model_repo_to_path +from vllm.transformers_utils.utils import check_gguf_file, convert_model_repo_to_path from vllm.utils.func_utils import get_allowed_kwarg_only_overrides if TYPE_CHECKING: @@ -236,9 +236,20 @@ def cached_processor_from_config( processor_cls: type[_P] | tuple[type[_P], ...] = ProcessorMixin, **kwargs: Any, ) -> _P: + if check_gguf_file(model_config.model): + assert not check_gguf_file(model_config.tokenizer), ( + "For multimodal GGUF models, the original tokenizer " + "should be used to correctly load processor." + ) + model = model_config.tokenizer + revision = model_config.tokenizer_revision + else: + model = model_config.model + revision = model_config.revision + return cached_get_processor_without_dynamic_kwargs( - model_config.model, - revision=model_config.revision, + model, + revision=revision, trust_remote_code=model_config.trust_remote_code, processor_cls=processor_cls, # type: ignore[arg-type] **_merge_mm_kwargs(model_config, processor_cls, **kwargs), @@ -339,9 +350,19 @@ def cached_image_processor_from_config( model_config: "ModelConfig", **kwargs: Any, ): + if check_gguf_file(model_config.model): + assert not check_gguf_file(model_config.tokenizer), ( + "For multimodal GGUF models, the original tokenizer " + "should be used to correctly load image processor." 
+ ) + model = model_config.tokenizer + revision = model_config.tokenizer_revision + else: + model = model_config.model + revision = model_config.revision return cached_get_image_processor( - model_config.model, - revision=model_config.revision, + model, + revision=revision, trust_remote_code=model_config.trust_remote_code, **_merge_mm_kwargs(model_config, AutoImageProcessor, **kwargs), ) diff --git a/vllm/transformers_utils/utils.py b/vllm/transformers_utils/utils.py index 1ae42ba622dc..901a64d9d263 100644 --- a/vllm/transformers_utils/utils.py +++ b/vllm/transformers_utils/utils.py @@ -27,6 +27,7 @@ def is_cloud_storage(model_or_path: str) -> bool: return is_s3(model_or_path) or is_gcs(model_or_path) +@cache def check_gguf_file(model: str | PathLike) -> bool: """Check if the file is a GGUF model.""" model = Path(model) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0102ca4739ad..67f575f92cc6 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -324,6 +324,7 @@ def __init__( # Multi-modal data support self.mm_registry = MULTIMODAL_REGISTRY self.uses_mrope = model_config.uses_mrope + self.uses_custom_attention_masks = model_config.uses_custom_attention_masks self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs( model_config ) @@ -2346,6 +2347,24 @@ def _preprocess( **self._init_model_kwargs(num_scheduled_tokens), **self._extract_mm_kwargs(scheduler_output), } + + # Generate custom attention masks for models that require them. + # V1 pre-generates embeddings, so forward() skips prepare_attn_masks(). + # Check mm_features (mm_embeds is empty during decode). + has_mm_features = any( + req_state.mm_features for req_state in self.requests.values() + ) + if ( + self.uses_custom_attention_masks + and has_mm_features + and hasattr(self.model, "generate_attention_masks") + ): + mask_kwargs = self.model.generate_attention_masks( + self.input_ids.gpu[:num_scheduled_tokens], + self.positions.gpu[:num_scheduled_tokens], + mask_dtype=self.model.dtype, + ) + model_kwargs.update(mask_kwargs) elif self.enable_prompt_embeds and is_first_rank: # Get the input embeddings for the tokens that are not input embeds, # then put them into the appropriate positions. 
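Taken together, the loader changes in this patch mean a multimodal Gemma3 GGUF checkpoint is loaded from two files, the quantized backbone (`*.gguf`) and the vision projector (`mmproj*.gguf`), which must sit in the same directory, with the original HF tokenizer supplied explicitly. A minimal offline-inference sketch under those assumptions follows; the repo and filenames mirror the new test, while the image URL is a placeholder and exact engine arguments may differ from the CI configuration.

```python
from huggingface_hub import hf_hub_download

from vllm import LLM

# Both files must land in the same directory so the loader can discover the
# mmproj file next to the backbone GGUF (see detect_gguf_multimodal above).
repo = "google/gemma-3-4b-it-qat-q4_0-gguf"
hf_hub_download(repo, filename="mmproj-model-f16-4B.gguf")
backbone = hf_hub_download(repo, filename="gemma-3-4b-it-q4_0.gguf")

# A tokenizer is now mandatory for GGUF models; pointing it at the original HF
# repo also lets the processor and image processor be resolved from there.
llm = LLM(model=backbone, tokenizer="google/gemma-3-4b-it", max_model_len=4096)

conversation = [
    {
        "role": "user",
        "content": [
            # Placeholder URL; any reachable image works here.
            {"type": "image_url", "image_url": {"url": "https://example.com/stop_sign.jpg"}},
            {"type": "text", "text": "Describe this image in detail:"},
        ],
    }
]
outputs = llm.chat(conversation)
print(outputs[0].outputs[0].text)
```

Omitting `tokenizer=` now raises an error for GGUF models, since the GGUF file alone does not carry the processor configuration needed for multimodal inputs.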
From f226a3f0c11aed72f585ebd2942d4a6832adbfb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Tue, 18 Nov 2025 18:22:30 +0100 Subject: [PATCH 157/578] [CI][NIXL] Change default `block_size` for tests (#28927) Signed-off-by: NickLucche --- tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh index ebc8575e5b39..87c9a105e936 100755 --- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh @@ -49,8 +49,8 @@ NUM_DECODE_INSTANCES=${NUM_DECODE_INSTANCES:-1} # Default to 1 PREFILLER_TP_SIZE=${PREFILLER_TP_SIZE:-1} DECODER_TP_SIZE=${DECODER_TP_SIZE:-1} GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.2} -PREFILL_BLOCK_SIZE=${PREFILL_BLOCK_SIZE:-16} -DECODE_BLOCK_SIZE=${DECODE_BLOCK_SIZE:-16} +PREFILL_BLOCK_SIZE=${PREFILL_BLOCK_SIZE:-128} +DECODE_BLOCK_SIZE=${DECODE_BLOCK_SIZE:-128} # Find the git repository root directory GIT_ROOT=$(git rev-parse --show-toplevel) From da8dadf68b5a2af849e7c5fd35ce9b8525d8d398 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Tue, 18 Nov 2025 09:26:07 -0800 Subject: [PATCH 158/578] [Minor] Rename `ec_producer` field to `is_ec_producer` (#28884) Signed-off-by: Nick Hill --- vllm/v1/engine/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index d49eb752d56a..3a25827cec38 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -180,7 +180,7 @@ def __init__( logger.info("Batch queue is enabled with size %d", self.batch_queue_size) self.batch_queue = deque(maxlen=self.batch_queue_size) - self.ec_producer = ( + self.is_ec_producer = ( vllm_config.ec_transfer_config is not None and vllm_config.ec_transfer_config.is_ec_producer ) @@ -390,7 +390,7 @@ def step_with_batch_queue( exec_future = self.model_executor.execute_model( scheduler_output, non_block=True ) - if not self.ec_producer: + if not self.is_ec_producer: model_executed = scheduler_output.total_num_scheduled_tokens > 0 if self.is_pooling_model or not model_executed: From 0af3d4f0df360decc2115f43f5e4bc732342e7e4 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Wed, 19 Nov 2025 01:28:34 +0800 Subject: [PATCH 159/578] =?UTF-8?q?[FEAT]=20[AITER]=20[ROCm]=20integrate?= =?UTF-8?q?=20aiter=C2=A0sampling=C2=A0ops=20(#26084)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: vllmellm --- vllm/v1/sample/ops/topk_topp_sampler.py | 77 +++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py index 02ea658b7f20..c6c7e924175f 100644 --- a/vllm/v1/sample/ops/topk_topp_sampler.py +++ b/vllm/v1/sample/ops/topk_topp_sampler.py @@ -7,6 +7,7 @@ from packaging import version from vllm import envs +from vllm._aiter_ops import rocm_aiter_ops from vllm.config.model import LogprobsMode from vllm.logger import init_logger from vllm.platforms import CpuArchEnum, current_platform @@ -55,6 +56,17 @@ def __init__(self, logprobs_mode: LogprobsMode = "raw_logprobs") -> None: self.forward = self.forward_native else: self.forward = self.forward_cpu + elif ( + logprobs_mode not in ("processed_logits", "processed_logprobs") + and rocm_aiter_ops.is_enabled() + ): + import aiter.ops.sampling # noqa: F401 + + self.aiter_ops = 
torch.ops.aiter + logger.info_once( + "Using aiter sampler on ROCm (lazy import, sampling-only)." + ) + self.forward = self.forward_hip else: self.forward = self.forward_native @@ -138,6 +150,64 @@ def forward_cpu( return probs.div_(q).argmax(dim=-1).view(-1), logits_to_return + def forward_hip( + self, + logits: torch.Tensor, + generators: dict[int, torch.Generator], + k: torch.Tensor | None, + p: torch.Tensor | None, + ) -> tuple[torch.Tensor, torch.Tensor | None]: + """Optimized ROCm/aiter path (same structure as forward_cuda).""" + if (k is None and p is None) or generators: + if generators: + logger.warning_once( + "aiter sampler does not support per-request generators; " + "falling back to PyTorch-native." + ) + return self.forward_native(logits, generators, k, p) + assert self.logprobs_mode not in ( + "processed_logits", + "processed_logprobs", + ), "aiter sampler does not support returning logits/logprobs." + return self.aiter_sample(logits, k, p, generators), None + + def aiter_sample( + self, + logits: torch.Tensor, + k: torch.Tensor | None, + p: torch.Tensor | None, + generators: dict[int, torch.Generator], + ) -> torch.Tensor: + """Sample from logits using aiter ops.""" + use_top_k = k is not None + use_top_p = p is not None + # Joint k+p path + if use_top_p and use_top_k: + probs = logits.softmax(dim=-1, dtype=torch.float32).contiguous() + next_token_ids = self.aiter_ops.top_k_top_p_sampling_from_probs( + probs, + None, + *_to_tensor_scalar_tuple(k), + *_to_tensor_scalar_tuple(p), + deterministic=True, + ) + return next_token_ids.view(-1) + # Top-p only path + elif use_top_p: + probs = logits.softmax(dim=-1, dtype=torch.float32).contiguous() + next_token_ids = self.aiter_ops.top_p_sampling_from_probs( + probs, None, *_to_tensor_scalar_tuple(p), deterministic=True + ) + return next_token_ids.view(-1) + # Top-k only path + elif use_top_k: + probs = logits.softmax(dim=-1, dtype=torch.float32).contiguous() + renorm_probs = self.aiter_ops.top_k_renorm_probs( + probs, *_to_tensor_scalar_tuple(k) + ) + return torch.multinomial(renorm_probs, num_samples=1).view(-1) + raise RuntimeError("aiter_sample was called with no active top-k or top-p.") + # Note: this is a workaround for # https://github.com/pytorch/pytorch/pull/151218 @@ -288,3 +358,10 @@ def flashinfer_sample( ) return next_token_ids.view(-1) + + +def _to_tensor_scalar_tuple(x): + if isinstance(x, torch.Tensor): + return (x, 0) + else: + return (None, x) From c64c0b78de4716ef019666663c56b6ceaa019463 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Tue, 18 Nov 2025 09:44:18 -0800 Subject: [PATCH 160/578] [chore] Move the rest of wikimedia url to S3 (#28921) Signed-off-by: Kevin H. 
Luu Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- docs/features/multimodal_inputs.md | 2 +- docs/serving/openai_compatible_server.md | 2 +- examples/offline_inference/vision_language_pooling.py | 4 ++-- .../openai_chat_completion_client_for_multimodal.py | 2 +- .../openai_chat_embedding_client_for_multimodal.py | 2 +- tests/entrypoints/openai/test_vision.py | 8 ++++---- tests/entrypoints/pooling/openai/test_vision_embedding.py | 8 ++++---- .../language/pooling/test_mm_classifier_conversion.py | 2 +- tests/multimodal/test_utils.py | 8 ++++---- tests/utils.py | 2 +- .../v1/entrypoints/openai/serving_responses/test_image.py | 8 ++++---- 11 files changed, 24 insertions(+), 24 deletions(-) diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index cde2ec165712..5f684604e603 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -483,7 +483,7 @@ Then, you can use the OpenAI client as follows: ) # Single-image input inference - image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" chat_response = client.chat.completions.create( model="microsoft/Phi-3.5-vision-instruct", diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index 821628e6e317..23df3963823a 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -293,7 +293,7 @@ and passing a list of `messages` in the request. Refer to the examples below for base_url="http://localhost:8000/v1", api_key="EMPTY", ) - image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" response = create_chat_embeddings( client, diff --git a/examples/offline_inference/vision_language_pooling.py b/examples/offline_inference/vision_language_pooling.py index 63d85d5d9eef..530aad4bc031 100644 --- a/examples/offline_inference/vision_language_pooling.py +++ b/examples/offline_inference/vision_language_pooling.py @@ -266,7 +266,7 @@ def get_query(modality: QueryModality): return ImageQuery( modality="image", image=fetch_image( - "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/American_Eskimo_Dog.jpg/360px-American_Eskimo_Dog.jpg" # noqa: E501 + "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/eskimo.jpg" # noqa: E501 ), ) @@ -275,7 +275,7 @@ def get_query(modality: QueryModality): modality="text+image", text="A cat standing in the snow.", image=fetch_image( - "https://upload.wikimedia.org/wikipedia/commons/thumb/b/b6/Felis_catus-cat_on_snow.jpg/179px-Felis_catus-cat_on_snow.jpg" # noqa: E501 + "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg" # noqa: E501 ), ) diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py index 520cbca003aa..3d1259276998 100644 --- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py +++ 
b/examples/online_serving/openai_chat_completion_client_for_multimodal.py @@ -66,7 +66,7 @@ def run_text_only(model: str, max_completion_tokens: int) -> None: # Single-image input inference def run_single_image(model: str, max_completion_tokens: int) -> None: ## Use image url in the payload - image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" chat_completion_from_url = client.chat.completions.create( messages=[ { diff --git a/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py b/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py index 261b810ce5d0..47c2c5030078 100644 --- a/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py +++ b/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py @@ -21,7 +21,7 @@ openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" -image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" +image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" def create_chat_embeddings( diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 2a7df08ea3b0..d83c6726e72d 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -17,10 +17,10 @@ # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) TEST_IMAGE_ASSETS = [ - "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" - "Grayscale_8bits_palette_sample_image.png", # "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png", - "1280px-Venn_diagram_rgb.svg.png", # "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png", - "RGBA_comp.png", # "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", + "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + "Grayscale_8bits_palette_sample_image.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/Grayscale_8bits_palette_sample_image.png", + "1280px-Venn_diagram_rgb.svg.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/1280px-Venn_diagram_rgb.svg.png", + "RGBA_comp.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/RGBA_comp.png", ] EXPECTED_MM_BEAM_SEARCH_RES = [ diff --git a/tests/entrypoints/pooling/openai/test_vision_embedding.py b/tests/entrypoints/pooling/openai/test_vision_embedding.py index 944392d66fa5..1befb5a3cf7a 100644 --- a/tests/entrypoints/pooling/openai/test_vision_embedding.py +++ b/tests/entrypoints/pooling/openai/test_vision_embedding.py @@ -19,10 +19,10 @@ # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) TEST_IMAGE_ASSETS = [ - 
"2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" - "Grayscale_8bits_palette_sample_image.png", # "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png", - "1280px-Venn_diagram_rgb.svg.png", # "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png", - "RGBA_comp.png", # "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", + "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + "Grayscale_8bits_palette_sample_image.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/Grayscale_8bits_palette_sample_image.png", + "1280px-Venn_diagram_rgb.svg.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/1280px-Venn_diagram_rgb.svg.png", + "RGBA_comp.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/RGBA_comp.png", ] diff --git a/tests/models/language/pooling/test_mm_classifier_conversion.py b/tests/models/language/pooling/test_mm_classifier_conversion.py index 2482452645ef..a31a771238e2 100644 --- a/tests/models/language/pooling/test_mm_classifier_conversion.py +++ b/tests/models/language/pooling/test_mm_classifier_conversion.py @@ -75,7 +75,7 @@ def test_gemma_multimodal( { "type": "image_url", "image_url": { - "url": "https://upload.wikimedia.org/wikipedia/commons/c/c6/Set_of_fourteen_side_chairs_MET_DP110780.jpg" + "url": "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/red_chair.jpg" }, }, {"type": "text", "text": "A fine 19th century piece of furniture."}, diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index ea795fcbbde5..639e290406fe 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -16,10 +16,10 @@ # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) TEST_IMAGE_ASSETS = [ - "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" - "Grayscale_8bits_palette_sample_image.png", # "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png", - "1280px-Venn_diagram_rgb.svg.png", # "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png", - "RGBA_comp.png", # "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", + "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + "Grayscale_8bits_palette_sample_image.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/Grayscale_8bits_palette_sample_image.png", + "1280px-Venn_diagram_rgb.svg.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/1280px-Venn_diagram_rgb.svg.png", + "RGBA_comp.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/RGBA_comp.png", ] TEST_VIDEO_URLS = [ diff --git a/tests/utils.py b/tests/utils.py index c8f18384c511..c31a2aeeb9c8 100644 --- 
a/tests/utils.py +++ b/tests/utils.py @@ -676,7 +676,7 @@ def compare_all_settings( results += _test_image_text( client, model, - "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", + "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/RGBA_comp.png", ) elif method == "encode": results += _test_embeddings(client, model, prompt) diff --git a/tests/v1/entrypoints/openai/serving_responses/test_image.py b/tests/v1/entrypoints/openai/serving_responses/test_image.py index 980d83b787e7..be5693bbf273 100644 --- a/tests/v1/entrypoints/openai/serving_responses/test_image.py +++ b/tests/v1/entrypoints/openai/serving_responses/test_image.py @@ -15,10 +15,10 @@ MAXIMUM_IMAGES = 2 # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) TEST_IMAGE_ASSETS = [ - "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" - "Grayscale_8bits_palette_sample_image.png", # "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png", - "1280px-Venn_diagram_rgb.svg.png", # "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png", - "RGBA_comp.png", # "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", + "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + "Grayscale_8bits_palette_sample_image.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/Grayscale_8bits_palette_sample_image.png", + "1280px-Venn_diagram_rgb.svg.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/1280px-Venn_diagram_rgb.svg.png", + "RGBA_comp.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/RGBA_comp.png", ] From e4bb2684bcea12f72a36a6c48292f79534af849a Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Wed, 19 Nov 2025 02:56:04 +0800 Subject: [PATCH 161/578] [Models] Replace all `nn.Conv2d` with vLLM's Conv2dLayer (#28842) Signed-off-by: Isotr0py --- vllm/model_executor/layers/conv.py | 24 +++++++++++++-- vllm/model_executor/models/aimv2.py | 3 +- vllm/model_executor/models/blip.py | 3 +- vllm/model_executor/models/chameleon.py | 29 +++++++++---------- vllm/model_executor/models/deepencoder.py | 13 +++++---- vllm/model_executor/models/dots_ocr.py | 3 +- vllm/model_executor/models/glm4_1v.py | 4 +-- vllm/model_executor/models/glm4v.py | 5 ++-- .../models/idefics2_vision_model.py | 3 +- vllm/model_executor/models/intern_vit.py | 3 +- vllm/model_executor/models/interns1_vit.py | 3 +- vllm/model_executor/models/keye.py | 3 +- vllm/model_executor/models/midashenglm.py | 3 +- vllm/model_executor/models/moonvit.py | 3 +- vllm/model_executor/models/paddleocr_vl.py | 3 +- vllm/model_executor/models/pixtral.py | 5 ++-- vllm/model_executor/models/qwen_vl.py | 3 +- vllm/model_executor/models/siglip.py | 3 +- vllm/model_executor/models/siglip2navit.py | 5 ++-- vllm/model_executor/models/step3_vl.py | 7 +++-- 20 files changed, 83 insertions(+), 45 deletions(-) diff --git a/vllm/model_executor/layers/conv.py b/vllm/model_executor/layers/conv.py index e6f2d2990c24..8d51e5bd9920 100644 --- a/vllm/model_executor/layers/conv.py +++ b/vllm/model_executor/layers/conv.py @@ -3,6 +3,7 @@ """Conv Layer Class.""" import 
math +from typing import Literal import torch import torch.nn as nn @@ -23,11 +24,11 @@ def __init__( out_channels: int, kernel_size: int | tuple[int, ...], stride: int | tuple[int, ...] = 1, - padding: int | tuple[int, ...] = 0, + padding: int | tuple[int, ...] | Literal["same", "valid"] = 0, dilation: int | tuple[int, ...] = 1, groups: int = 1, bias: bool = True, - padding_mode: str = "zeros", + padding_mode: Literal["zeros", "reflect", "replicate", "circular"] = "zeros", *, params_dtype: torch.dtype | None = None, ) -> None: @@ -36,6 +37,22 @@ def __init__( if params_dtype is None: params_dtype = torch.get_default_dtype() + valid_padding_strings = {"same", "valid"} + if isinstance(padding, str) and padding not in valid_padding_strings: + raise ValueError( + f"Invalid padding string '{padding}'. " + f"Expected one of {valid_padding_strings}." + ) + + if padding == "same": + padding = ( + kernel_size // 2 + if isinstance(kernel_size, int) + else tuple(k // 2 for k in kernel_size) + ) + elif padding == "valid": + padding = 0 + kernel_size = ( (kernel_size,) * self.num_dim if isinstance(kernel_size, int) @@ -45,6 +62,9 @@ def __init__( padding = (padding,) * self.num_dim if isinstance(padding, int) else padding dilation = (dilation,) * self.num_dim if isinstance(dilation, int) else dilation + if padding == "same" and any(s != 1 for s in stride): + raise ValueError("padding='same' is not supported for strided convolutions") + self.in_channels = in_channels self.out_channels = out_channels self.kernel_size = kernel_size diff --git a/vllm/model_executor/models/aimv2.py b/vllm/model_executor/models/aimv2.py index 5872e8196ead..3d000f3ac3ab 100644 --- a/vllm/model_executor/models/aimv2.py +++ b/vllm/model_executor/models/aimv2.py @@ -12,6 +12,7 @@ from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed.utils import divide from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, @@ -58,7 +59,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class AIMv2PatchEmbed(nn.Module): def __init__(self, config: AIMv2Config): super().__init__() - self.proj = nn.Conv2d( + self.proj = Conv2dLayer( config.num_channels, config.hidden_size, kernel_size=(config.patch_size, config.patch_size), diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index 2e4f73312efa..f31f99c0592b 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -12,6 +12,7 @@ from vllm.attention.layer import MultiHeadAttention from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, @@ -47,7 +48,7 @@ def __init__(self, config: BlipVisionConfig | Blip2VisionConfig): self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim)) - self.patch_embedding = nn.Conv2d( + self.patch_embedding = Conv2dLayer( in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index fb7476c45fcd..3c87bbfefab3 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -22,6 +22,7 @@ from 
vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, @@ -549,7 +550,7 @@ def forward(self, hidden_state: torch.Tensor): class ChameleonVQVAEEncoderConvDownsample(nn.Module): def __init__(self, in_channels: int): super().__init__() - self.conv = nn.Conv2d( + self.conv = Conv2dLayer( in_channels, in_channels, kernel_size=3, stride=2, padding=0 ) @@ -577,23 +578,23 @@ def __init__( self.norm1 = torch.nn.GroupNorm( num_groups=32, num_channels=in_channels, eps=1e-6, affine=True ) - self.conv1 = torch.nn.Conv2d( + self.conv1 = Conv2dLayer( in_channels, out_channels, kernel_size=3, stride=1, padding=1 ) self.norm2 = torch.nn.GroupNorm( num_groups=32, num_channels=out_channels, eps=1e-6, affine=True ) self.dropout = torch.nn.Dropout(config.dropout) - self.conv2 = torch.nn.Conv2d( + self.conv2 = Conv2dLayer( out_channels, out_channels, kernel_size=3, stride=1, padding=1 ) if self.in_channels != self.out_channels: if self.use_conv_shortcut: - self.conv_shortcut = torch.nn.Conv2d( + self.conv_shortcut = Conv2dLayer( in_channels, out_channels, kernel_size=3, stride=1, padding=1 ) else: - self.nin_shortcut = torch.nn.Conv2d( + self.nin_shortcut = Conv2dLayer( in_channels, out_channels, kernel_size=1, stride=1, padding=0 ) @@ -626,16 +627,16 @@ def __init__(self, in_channels: int): self.norm = torch.nn.GroupNorm( num_groups=32, num_channels=in_channels, eps=1e-6, affine=True ) - self.q = torch.nn.Conv2d( + self.q = Conv2dLayer( in_channels, in_channels, kernel_size=1, stride=1, padding=0 ) - self.k = torch.nn.Conv2d( + self.k = Conv2dLayer( in_channels, in_channels, kernel_size=1, stride=1, padding=0 ) - self.v = torch.nn.Conv2d( + self.v = Conv2dLayer( in_channels, in_channels, kernel_size=1, stride=1, padding=0 ) - self.proj_out = torch.nn.Conv2d( + self.proj_out = Conv2dLayer( in_channels, in_channels, kernel_size=1, stride=1, padding=0 ) @@ -681,7 +682,7 @@ def __init__(self, config: ChameleonVQVAEConfig): latent_channels = config.latent_channels channel_multiplier = config.channel_multiplier - self.conv_in = torch.nn.Conv2d( + self.conv_in = Conv2dLayer( in_channels, base_channels, kernel_size=3, stride=1, padding=1 ) @@ -738,7 +739,7 @@ def __init__(self, config: ChameleonVQVAEConfig): self.norm_out = torch.nn.GroupNorm( num_groups=32, num_channels=block_in, eps=1e-6, affine=True ) - self.conv_out = torch.nn.Conv2d( + self.conv_out = Conv2dLayer( block_in, 2 * latent_channels if double_latent else latent_channels, kernel_size=3, @@ -779,10 +780,8 @@ def __init__(self, config: ChameleonVQVAEConfig): super().__init__() self.encoder = ChameleonVQVAEEncoder(config) self.quantize = ChameleonVQVAEVectorQuantizer(config) - self.quant_conv = torch.nn.Conv2d(config.latent_channels, config.embed_dim, 1) - self.post_quant_conv = torch.nn.Conv2d( - config.embed_dim, config.latent_channels, 1 - ) + self.quant_conv = Conv2dLayer(config.latent_channels, config.embed_dim, 1) + self.post_quant_conv = Conv2dLayer(config.embed_dim, config.latent_channels, 1) self.eval() # Chameleon's VQ model is frozen def encode( diff --git a/vllm/model_executor/models/deepencoder.py b/vllm/model_executor/models/deepencoder.py index e62a57eccc95..8f1660891fcb 100644 --- a/vllm/model_executor/models/deepencoder.py +++ 
b/vllm/model_executor/models/deepencoder.py @@ -19,6 +19,7 @@ from transformers import CLIPVisionConfig from vllm.attention.layer import MultiHeadAttention +from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -133,14 +134,14 @@ def __init__( self.blocks.append(block) self.neck = nn.Sequential( - nn.Conv2d( + Conv2dLayer( embed_dim, out_chans, kernel_size=1, bias=False, ), LayerNorm2d(out_chans), - nn.Conv2d( + Conv2dLayer( out_chans, out_chans, kernel_size=3, @@ -150,8 +151,10 @@ def __init__( LayerNorm2d(out_chans), ) - self.net_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1, bias=False) - self.net_3 = nn.Conv2d( + self.net_2 = Conv2dLayer( + 256, 512, kernel_size=3, stride=2, padding=1, bias=False + ) + self.net_3 = Conv2dLayer( 512, 1024, kernel_size=3, stride=2, padding=1, bias=False ) @@ -500,7 +503,7 @@ def __init__( """ super().__init__() - self.proj = nn.Conv2d( + self.proj = Conv2dLayer( in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding ) diff --git a/vllm/model_executor/models/dots_ocr.py b/vllm/model_executor/models/dots_ocr.py index f46caaa095c6..2d2251e83b5b 100644 --- a/vllm/model_executor/models/dots_ocr.py +++ b/vllm/model_executor/models/dots_ocr.py @@ -22,6 +22,7 @@ get_tensor_model_parallel_world_size, ) from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, @@ -471,7 +472,7 @@ def __init__(self, config): self.temporal_patch_size = config.temporal_patch_size self.embed_dim = config.embed_dim self.config = config - self.proj = nn.Conv2d( + self.proj = Conv2dLayer( config.num_channels, config.embed_dim, kernel_size=(config.patch_size, config.patch_size), diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 65c3fc2d9e97..2c2f45c2453e 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -56,7 +56,7 @@ from vllm.distributed import get_tensor_model_parallel_world_size, parallel_state from vllm.distributed import utils as dist_utils from vllm.logger import init_logger -from vllm.model_executor.layers.conv import Conv3dLayer +from vllm.model_executor.layers.conv import Conv2dLayer, Conv3dLayer from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, @@ -734,7 +734,7 @@ def __init__( self.post_conv_layernorm = RMSNorm( vision_config.hidden_size, eps=vision_config.rms_norm_eps ) - self.downsample = nn.Conv2d( + self.downsample = Conv2dLayer( in_channels=vision_config.hidden_size, out_channels=vision_config.out_hidden_size, kernel_size=vision_config.spatial_merge_size, diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py index 1c18ea0745f2..514082cf60ce 100644 --- a/vllm/model_executor/models/glm4v.py +++ b/vllm/model_executor/models/glm4v.py @@ -24,6 +24,7 @@ from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn +from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, MergedColumnParallelLinear, @@ -78,7 
+79,7 @@ class GLMVImagePixelInputs(TensorSchema): class EVA2CLIPPatchEmbedding(nn.Module): def __init__(self, config): super().__init__() - self.proj = nn.Conv2d( + self.proj = Conv2dLayer( config.in_channels, config.hidden_size, kernel_size=config.patch_size, @@ -333,7 +334,7 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.linear_proj", ) - self.conv = nn.Conv2d( + self.conv = Conv2dLayer( in_channels=vision_config.hidden_size, out_channels=config.hidden_size, kernel_size=2, diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index 727c8ec0397c..06b8468e18db 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -30,6 +30,7 @@ from vllm.attention.layer import MultiHeadAttention from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, @@ -60,7 +61,7 @@ def __init__(self, config: Idefics2VisionConfig): self.embed_dim = config.hidden_size self.image_size = config.image_size self.patch_size = config.patch_size - self.patch_embedding = nn.Conv2d( + self.patch_embedding = Conv2dLayer( in_channels=config.num_channels, out_channels=self.embed_dim, kernel_size=self.patch_size, diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 03918127c6ae..61aeafc2ab43 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -24,6 +24,7 @@ tensor_model_parallel_all_gather, ) from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, @@ -51,7 +52,7 @@ def __init__(self, config: PretrainedConfig): self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim)) - self.patch_embedding = nn.Conv2d( + self.patch_embedding = Conv2dLayer( in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, diff --git a/vllm/model_executor/models/interns1_vit.py b/vllm/model_executor/models/interns1_vit.py index 507503d75046..cb0414bbc95a 100644 --- a/vllm/model_executor/models/interns1_vit.py +++ b/vllm/model_executor/models/interns1_vit.py @@ -16,6 +16,7 @@ from vllm.attention.layer import MultiHeadAttention from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear from vllm.model_executor.layers.quantization import QuantizationConfig @@ -43,7 +44,7 @@ def __init__(self, config): self.num_patches = num_patches self.patch_shape = patch_shape - self.projection = nn.Conv2d( + self.projection = Conv2dLayer( num_channels, hidden_size, kernel_size=patch_size, stride=patch_size ) diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index 1eb0eccc0411..8fc3db296aa7 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -24,6 +24,7 @@ from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size from vllm.logger import init_logger +from 
vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, @@ -204,7 +205,7 @@ def __init__(self, config: PretrainedConfig): self.image_size = config.image_size self.patch_size = config.patch_size - self.patch_embedding = nn.Conv2d( + self.patch_embedding = Conv2dLayer( in_channels=config.num_channels, out_channels=self.embed_dim, kernel_size=self.patch_size, diff --git a/vllm/model_executor/models/midashenglm.py b/vllm/model_executor/models/midashenglm.py index a84c99059cd9..d9b23811730d 100644 --- a/vllm/model_executor/models/midashenglm.py +++ b/vllm/model_executor/models/midashenglm.py @@ -39,6 +39,7 @@ from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, @@ -120,7 +121,7 @@ def __init__( self.num_patches = self.grid_size[0] * self.grid_size[1] self.flatten = flatten - self.proj = nn.Conv2d( + self.proj = Conv2dLayer( in_chans, embed_dim, kernel_size=self.patch_size, diff --git a/vllm/model_executor/models/moonvit.py b/vllm/model_executor/models/moonvit.py index 8017c947bf9a..2e3e6dc166ad 100644 --- a/vllm/model_executor/models/moonvit.py +++ b/vllm/model_executor/models/moonvit.py @@ -53,6 +53,7 @@ from transformers.modeling_utils import PreTrainedModel from transformers.utils import is_flash_attn_2_available +from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.models.utils import maybe_prefix from vllm.transformers_utils.configs.moonvit import MoonViTConfig @@ -244,7 +245,7 @@ def __init__( ) self.patch_size = patch_size - self.proj = nn.Conv2d( + self.proj = Conv2dLayer( in_dim, out_dim, kernel_size=patch_size, stride=patch_size ) diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py index 3ef6470070d1..dee0c16ab0f6 100644 --- a/vllm/model_executor/models/paddleocr_vl.py +++ b/vllm/model_executor/models/paddleocr_vl.py @@ -45,6 +45,7 @@ from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, @@ -419,7 +420,7 @@ def __init__(self, config: PretrainedConfig): self.image_size = config.image_size self.patch_size = config.patch_size - self.patch_embedding = nn.Conv2d( + self.patch_embedding = Conv2dLayer( in_channels=config.num_channels, out_channels=self.embed_dim, kernel_size=self.patch_size, diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 8cb7d6a889da..8a034fd72b02 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -31,6 +31,7 @@ from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_and_mul_fn +from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, @@ -747,7 +748,7 @@ class VisionTransformer(nn.Module): def 
__init__(self, args: VisionEncoderArgs): super().__init__() self.args = args - self.patch_conv = nn.Conv2d( + self.patch_conv = Conv2dLayer( in_channels=args.num_channels, out_channels=args.hidden_size, kernel_size=args.patch_size, @@ -1212,7 +1213,7 @@ def __init__( self.config = config - self.patch_conv = nn.Conv2d( + self.patch_conv = Conv2dLayer( in_channels=config.num_channels, out_channels=config.hidden_size, kernel_size=config.patch_size, diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py index 6a259cade9cf..4906cf441f6f 100644 --- a/vllm/model_executor/models/qwen_vl.py +++ b/vllm/model_executor/models/qwen_vl.py @@ -25,6 +25,7 @@ from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, ReplicatedLinear, @@ -333,7 +334,7 @@ def __init__( patch_height, patch_width = self.patch_size = (patch_size, patch_size) self.grid_size = (image_height // patch_height, image_width // patch_width) self.output_dim = output_dim - self.conv1 = nn.Conv2d( + self.conv1 = Conv2dLayer( in_channels=3, out_channels=width, kernel_size=patch_size, diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 42d906d089f9..ce5847bf79a5 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -24,6 +24,7 @@ from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, @@ -286,7 +287,7 @@ def __init__(self, config: SiglipVisionConfig): self.image_size = config.image_size self.patch_size = config.patch_size - self.patch_embedding = nn.Conv2d( + self.patch_embedding = Conv2dLayer( in_channels=config.num_channels, out_channels=self.embed_dim, kernel_size=self.patch_size, diff --git a/vllm/model_executor/models/siglip2navit.py b/vllm/model_executor/models/siglip2navit.py index 29dd164ad37f..46f5e67d659e 100644 --- a/vllm/model_executor/models/siglip2navit.py +++ b/vllm/model_executor/models/siglip2navit.py @@ -16,6 +16,7 @@ from vllm.attention.layer import maybe_get_vit_flash_attn_backend from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, LinearBase, @@ -67,7 +68,7 @@ def __init__(self, config: PretrainedConfig): self.position_embedding = nn.Embedding(self.num_patches, self.embed_dim) else: - self.patch_embedding = nn.Conv2d( + self.patch_embedding = Conv2dLayer( in_channels=config.num_channels, out_channels=self.embed_dim, kernel_size=self.patch_size, @@ -99,7 +100,7 @@ def forward( target_dtype = self.patch_embedding.weight.dtype if isinstance(self.patch_embedding, LinearBase): patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) - elif isinstance(self.patch_embedding, nn.Conv2d): + elif isinstance(self.patch_embedding, Conv2dLayer): pixel_values = pixel_values.view( -1, self.config.num_channels * self.config.temporal_patch_size, diff --git a/vllm/model_executor/models/step3_vl.py 
b/vllm/model_executor/models/step3_vl.py index 5d16be1eb312..1c60cb414812 100644 --- a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -20,6 +20,7 @@ from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, @@ -667,7 +668,7 @@ def __init__(self, config: Step3VisionEncoderConfig): self.class_embedding = nn.Parameter(torch.randn(1, self.embed_dim)) - self.patch_embedding = nn.Conv2d( + self.patch_embedding = Conv2dLayer( in_channels=config.num_channels, out_channels=self.embed_dim, kernel_size=self.patch_size, @@ -950,13 +951,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: prefix=maybe_prefix(prefix, "vision_model"), use_data_parallel=self.use_data_parallel, ) - self.vit_downsampler = nn.Conv2d( + self.vit_downsampler = Conv2dLayer( config.vision_config.hidden_size, config.vision_config.output_hidden_size, kernel_size=2, stride=config.understand_projector_stride, ) - self.vit_downsampler2 = nn.Conv2d( + self.vit_downsampler2 = Conv2dLayer( config.vision_config.output_hidden_size, config.vision_config.output_hidden_size * 2, kernel_size=3, From c3e29786209d91d3842e839b62f4d1d815902262 Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Tue, 18 Nov 2025 13:03:23 -0600 Subject: [PATCH 162/578] [NIXL] fix cpu PD after physical <> logical block_size PR (#28904) Signed-off-by: Chendi Xue --- .../nixl_integration/run_accuracy_test.sh | 9 +++++++-- tools/install_nixl_from_source_ubuntu.py | 1 + .../kv_transfer/kv_connector/v1/nixl_connector.py | 12 +++++++++--- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh index 87c9a105e936..453ccc81eb14 100755 --- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh @@ -55,7 +55,7 @@ DECODE_BLOCK_SIZE=${DECODE_BLOCK_SIZE:-128} # Find the git repository root directory GIT_ROOT=$(git rev-parse --show-toplevel) -SMI_BIN=$(which nvidia-smi || which rocm-smi) +SMI_BIN=$(which nvidia-smi || which rocm-smi || echo "") # Trap the SIGINT signal (triggered by Ctrl+C) trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT @@ -91,8 +91,13 @@ get_model_args() { get_num_gpus() { if [[ "$SMI_BIN" == *"nvidia"* ]]; then echo "$($SMI_BIN --query-gpu=name --format=csv,noheader | wc -l)" - else + elif [[ "$SMI_BIN" == *"rocm"* ]]; then echo "$($SMI_BIN -l | grep GPU | wc -l)" + else + # works for non-cuda platforms, + # assuming at least 1 device and + # let system to decide which card to use + echo "1" fi } diff --git a/tools/install_nixl_from_source_ubuntu.py b/tools/install_nixl_from_source_ubuntu.py index a786abba95ad..b8a55c615426 100644 --- a/tools/install_nixl_from_source_ubuntu.py +++ b/tools/install_nixl_from_source_ubuntu.py @@ -95,6 +95,7 @@ def install_system_dependencies(): "meson", "libtool", "libtool-bin", + "pkg-config", ] run_command(["apt-get", "update"]) run_command(["apt-get", "install", "-y"] + apt_packages) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 5ff95876ef34..1626f819af8b 100644 --- 
a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -1161,6 +1161,14 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): # to better exploit the memory layout (ie num_blocks is the first dim). split_k_and_v = self.kv_topo.split_k_and_v tensor_size_bytes = None + + # TODO (NickLucche): Get kernel_block_size in a cleaner way + # NHD default "view" for non-MLA cache + if self.device_type == "cpu": + block_size_position = -2 + else: + block_size_position = -2 if self.use_mla else -3 + # Enable different block lengths for different layers when MLA is used. self.block_len_per_layer = list[int]() self.slot_size_per_layer = list[int]() # HD bytes in kv terms @@ -1175,9 +1183,7 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): if base_addr in seen_base_addresses: continue - # TODO (NickLucche): Get kernel_block_size in a cleaner way - # NHD default "view" for non-MLA cache - kernel_block_size = cache.shape[-2] if self.use_mla else cache.shape[-3] + kernel_block_size = cache.shape[block_size_position] if self.block_size != kernel_block_size: logger.info_once( From 2a2d5d2780bf25035438263605c7784f12afb718 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Wed, 19 Nov 2025 03:34:36 +0800 Subject: [PATCH 163/578] Replace `torch.cuda.Event` with `torch.Event` for better hardware compatibility (#26985) Signed-off-by: Kunshang Ji --- benchmarks/kernels/benchmark_cutlass_moe_fp8.py | 4 ++-- benchmarks/kernels/benchmark_moe.py | 4 ++-- .../kernels/benchmark_moe_permute_unpermute.py | 8 ++++---- .../kernels/benchmark_per_token_group_quant.py | 4 ++-- benchmarks/kernels/benchmark_silu_mul_fp8_quant.py | 4 ++-- .../kernels/benchmark_trtllm_decode_attention.py | 4 ++-- .../kernels/benchmark_trtllm_prefill_attention.py | 4 ++-- benchmarks/kernels/benchmark_w8a8_block_fp8.py | 4 ++-- tests/kernels/attention/test_merge_attn_states.py | 8 ++++---- vllm/v1/kv_offload/worker/cpu_gpu.py | 6 +++--- vllm/v1/worker/cpu_model_runner.py | 6 +++--- vllm/v1/worker/gpu_input_batch.py | 4 ++-- vllm/v1/worker/gpu_model_runner.py | 12 ++++++------ vllm/v1/worker/ubatching.py | 8 ++++---- vllm/v1/worker/xpu_model_runner.py | 9 +-------- 15 files changed, 41 insertions(+), 48 deletions(-) diff --git a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py index 027f67ad4db6..e07d6c776bc0 100644 --- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py +++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py @@ -255,8 +255,8 @@ def bench_cuda_graph(graph, num_warmup=5, num_iters=100): torch.cuda.synchronize() # Timing - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) + start_event = torch.Event(enable_timing=True) + end_event = torch.Event(enable_timing=True) latencies = [] for _ in range(num_iters): diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index c99951aa2782..a1af0b8aec3d 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -185,8 +185,8 @@ def run(): graph.replay() torch.cuda.synchronize() - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) + start_event = torch.Event(enable_timing=True) + end_event = torch.Event(enable_timing=True) latencies: list[float] = [] for i in range(num_iters): diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py 
b/benchmarks/kernels/benchmark_moe_permute_unpermute.py index efa5a7386027..b8913a217c60 100644 --- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py +++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py @@ -105,8 +105,8 @@ def run(): graph.replay() torch.cuda.synchronize() - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) + start_event = torch.Event(enable_timing=True) + end_event = torch.Event(enable_timing=True) latencies: list[float] = [] for i in range(num_iters): @@ -241,8 +241,8 @@ def run(input: tuple): graph.replay() torch.cuda.synchronize() - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) + start_event = torch.Event(enable_timing=True) + end_event = torch.Event(enable_timing=True) latencies: list[float] = [] for i in range(num_iters): diff --git a/benchmarks/kernels/benchmark_per_token_group_quant.py b/benchmarks/kernels/benchmark_per_token_group_quant.py index bdc1eb733084..eba4d510258b 100644 --- a/benchmarks/kernels/benchmark_per_token_group_quant.py +++ b/benchmarks/kernels/benchmark_per_token_group_quant.py @@ -30,8 +30,8 @@ def _time_cuda( fn() torch.cuda.synchronize() - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) + start = torch.Event(enable_timing=True) + end = torch.Event(enable_timing=True) start.record() for _ in range(bench_iters): diff --git a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py index a5887aafd30d..de01ff197eab 100644 --- a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py +++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py @@ -253,8 +253,8 @@ def generate_expert_loads(n_e, total_tokens, ratio, device="cuda"): ) torch.cuda.synchronize() - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) + start_event = torch.Event(enable_timing=True) + end_event = torch.Event(enable_timing=True) # Benchmark latencies: list[float] = [] diff --git a/benchmarks/kernels/benchmark_trtllm_decode_attention.py b/benchmarks/kernels/benchmark_trtllm_decode_attention.py index 29ce18234dfa..1d0d6fbb9a47 100644 --- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py +++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py @@ -127,8 +127,8 @@ def benchmark_decode( def time_fn(fn, warmup=10, trials=20): torch.cuda.synchronize() - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) + start = torch.Event(enable_timing=True) + end = torch.Event(enable_timing=True) times = [] for i in range(warmup): fn() diff --git a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py index 2a25d0374811..84bde723abf7 100644 --- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py +++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py @@ -139,8 +139,8 @@ def benchmark_prefill( def time_fn(fn, warmup=10, trials=20): torch.cuda.synchronize() - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) + start = torch.Event(enable_timing=True) + end = torch.Event(enable_timing=True) times = [] for i in range(warmup): fn() diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py index ab54f81985bc..b52500c8c521 100644 --- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py +++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py @@ 
-183,8 +183,8 @@ def run(): run() torch.cuda.synchronize() - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) + start_event = torch.Event(enable_timing=True) + end_event = torch.Event(enable_timing=True) latencies: list[float] = [] for i in range(num_iters): diff --git a/tests/kernels/attention/test_merge_attn_states.py b/tests/kernels/attention/test_merge_attn_states.py index 9b084f2f660b..c7662223e1ca 100644 --- a/tests/kernels/attention/test_merge_attn_states.py +++ b/tests/kernels/attention/test_merge_attn_states.py @@ -150,8 +150,8 @@ def test_merge_attn_states( output_torch = output.clone() output_lse_torch = output_lse.clone() total_time_torch_kernel = 0 - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) + start = torch.Event(enable_timing=True) + end = torch.Event(enable_timing=True) # 0. Run the Torch kernel prefix_lse_torch = prefix_lse.clone() @@ -188,8 +188,8 @@ def test_merge_attn_states( output_lse_ref_triton = output_lse.clone() total_time_triton_kernel = 0 - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) + start = torch.Event(enable_timing=True) + end = torch.Event(enable_timing=True) for _ in range(warmup_times): merge_attn_states_triton( diff --git a/vllm/v1/kv_offload/worker/cpu_gpu.py b/vllm/v1/kv_offload/worker/cpu_gpu.py index 646f9d0d7542..0f2ec4a1b41f 100644 --- a/vllm/v1/kv_offload/worker/cpu_gpu.py +++ b/vllm/v1/kv_offload/worker/cpu_gpu.py @@ -68,9 +68,9 @@ def __init__( self.h2d_stream = torch.cuda.Stream() # job_id -> transfer cuda event - self.transfer_events: dict[int, torch.cuda.Event] = {} + self.transfer_events: dict[int, torch.Event] = {} # list of cuda events available for re-use - self.events_pool: list[torch.cuda.Event] = [] + self.events_pool: list[torch.Event] = [] pin_memory = is_pin_memory_available() @@ -153,7 +153,7 @@ def transfer_async(self, job_id: int, spec: TransferSpec) -> bool: ) src_to_dst_tensor = torch.from_numpy(src_to_dst) - event = self.events_pool.pop() if self.events_pool else torch.cuda.Event() + event = self.events_pool.pop() if self.events_pool else torch.Event() with torch.cuda.stream(stream): for src_tensor, dst_tensor, kv_dim in zip( src_tensors, dst_tensors, self.kv_dim_before_num_blocks diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py index 40f011fed1ad..6bfbc32d598f 100644 --- a/vllm/v1/worker/cpu_model_runner.py +++ b/vllm/v1/worker/cpu_model_runner.py @@ -96,14 +96,14 @@ class _StreamPlaceholder: def __init__(self, *args, **kwargs) -> None: pass - cuda_event = torch.cuda.Event + cuda_event = torch.Event cuda_stream = torch.cuda.Stream try: - torch.cuda.Event = _EventPlaceholder + torch.Event = _EventPlaceholder torch.cuda.Stream = _StreamPlaceholder yield finally: - torch.cuda.Event = cuda_event + torch.Event = cuda_event torch.cuda.Stream = cuda_stream diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 7cf6afa3fc37..023b5edb2c34 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -265,7 +265,7 @@ def __init__( # ids from prior step, if required by current sampling params # (e.g. penalties). 
self.sampled_token_ids_cpu: torch.Tensor | None = None - self.async_copy_ready_event: torch.cuda.Event | None = None + self.async_copy_ready_event: torch.Event | None = None @property def req_ids(self) -> list[str]: @@ -891,7 +891,7 @@ def make_lora_inputs( def set_async_sampled_token_ids( self, sampled_token_ids_cpu: torch.Tensor, - async_copy_ready_event: torch.cuda.Event, + async_copy_ready_event: torch.Event, ) -> None: """ In async scheduling case, store ref to sampled_token_ids_cpu diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 67f575f92cc6..506118d2d762 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -185,7 +185,7 @@ def __init__( self._invalid_req_indices = invalid_req_indices # Event on the copy stream so we can synchronize the non-blocking copy. - self.async_copy_ready_event = torch.cuda.Event() + self.async_copy_ready_event = torch.Event() # Keep a reference to the device tensor to avoid it being # deallocated until we finish copying it to the host. @@ -435,10 +435,10 @@ def __init__( self.async_output_copy_stream: torch.cuda.Stream | None = None # cuda event to synchronize use of reused CPU tensors between steps # when async scheduling is enabled. - self.prepare_inputs_event: torch.cuda.Event | None = None + self.prepare_inputs_event: torch.Event | None = None if self.use_async_scheduling: self.async_output_copy_stream = torch.cuda.Stream() - self.prepare_inputs_event = torch.cuda.Event() + self.prepare_inputs_event = torch.Event() # self.cudagraph_batch_sizes sorts in ascending order. if ( @@ -549,7 +549,7 @@ def __init__( # Cached outputs. self._draft_token_ids: list[list[int]] | torch.Tensor | None = None - self.transfer_event = torch.cuda.Event() + self.transfer_event = torch.Event() self.sampled_token_ids_pinned_cpu = torch.empty( (self.max_num_reqs, 1), dtype=torch.int64, @@ -559,10 +559,10 @@ def __init__( # Pre-allocated tensor for copying valid sampled token counts to CPU, # with dedicated stream for overlapping and event for coordination. - self.valid_sampled_token_count_event: torch.cuda.Event | None = None + self.valid_sampled_token_count_event: torch.Event | None = None self.valid_sampled_token_count_copy_stream: torch.cuda.Stream | None = None if self.use_async_scheduling and self.num_spec_tokens: - self.valid_sampled_token_count_event = torch.cuda.Event() + self.valid_sampled_token_count_event = torch.Event() self.valid_sampled_token_count_copy_stream = torch.cuda.Stream() self.valid_sampled_token_count_cpu = torch.empty( self.max_num_reqs, diff --git a/vllm/v1/worker/ubatching.py b/vllm/v1/worker/ubatching.py index 9f16b1e6d03e..be8326e2fdbc 100644 --- a/vllm/v1/worker/ubatching.py +++ b/vllm/v1/worker/ubatching.py @@ -27,8 +27,8 @@ def __init__( ready_barrier: threading.Barrier, cpu_wait_event: threading.Event, cpu_signal_event: threading.Event, - gpu_comm_done_event: torch.cuda.Event, - gpu_compute_done_event: torch.cuda.Event, + gpu_comm_done_event: torch.Event, + gpu_compute_done_event: torch.Event, schedule: str = "default", ): self.id = id @@ -207,8 +207,8 @@ def make_ubatch_contexts( Create a context manager for micro-batching synchronization. 
""" cpu_events = [threading.Event() for _ in range(num_micro_batches)] - gpu_comm_done_events = [torch.cuda.Event() for _ in range(num_micro_batches)] - gpu_compute_done_events = [torch.cuda.Event() for _ in range(num_micro_batches)] + gpu_comm_done_events = [torch.Event() for _ in range(num_micro_batches)] + gpu_compute_done_events = [torch.Event() for _ in range(num_micro_batches)] assert len(forward_contexts) == 2 diff --git a/vllm/v1/worker/xpu_model_runner.py b/vllm/v1/worker/xpu_model_runner.py index 4f82c18da73a..30563305853a 100644 --- a/vllm/v1/worker/xpu_model_runner.py +++ b/vllm/v1/worker/xpu_model_runner.py @@ -37,19 +37,12 @@ def _sync_device(self) -> None: @contextmanager def _torch_cuda_wrapper(): - class _EventPlaceholder: - def __init__(self, *args, **kwargs) -> None: - self.record = lambda: None - self.synchronize = lambda: None - try: # replace cuda APIs with xpu APIs, this should work by default - torch.cuda.Event = torch.xpu.Event torch.cuda.Stream = torch.xpu.Stream torch.cuda.default_stream = torch.xpu.current_stream torch.cuda.current_stream = torch.xpu.current_stream torch.cuda.stream = torch.xpu.stream yield finally: - # if anything goes wrong, just patch it with a placeholder - torch.cuda.Event = _EventPlaceholder + pass From 67745d189fd981ee824bde35666a3737a962c031 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 18 Nov 2025 15:29:06 -0500 Subject: [PATCH 164/578] Supress verbose logs from model_hosting_container_standards (#28949) Signed-off-by: mgoin --- vllm/entrypoints/openai/api_server.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 3cf66fcd27e2..3974f45a7135 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -5,6 +5,7 @@ import importlib import inspect import json +import logging import multiprocessing import multiprocessing.forkserver as forkserver import os @@ -2020,6 +2021,9 @@ async def run_server(args, **uvicorn_kwargs) -> None: # Add process-specific prefix to stdout and stderr. decorate_logs("APIServer") + # Suppress verbose logs from model_hosting_container_standards + logging.getLogger("model_hosting_container_standards").setLevel(logging.ERROR) + listen_address, sock = setup_server(args) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) From 49ef847aa82c93615d5d86fac81e4716e9cd27cd Mon Sep 17 00:00:00 2001 From: Johnny Date: Wed, 19 Nov 2025 01:44:27 +0100 Subject: [PATCH 165/578] [NVIDIA] Guard SM100 CUTLASS MoE macro to SM100 builds v2 (#28938) Signed-off-by: johnnynunez Signed-off-by: Johnny --- CMakeLists.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3a37040edbf1..c1c7478b9f3e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -512,9 +512,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. 
CUTLASS 3.x) # require CUDA 12.8 or later if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) - cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}") else() - cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}") endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) set(SRCS @@ -619,9 +619,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # FP4 Archs and flags if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) - cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}") else() - cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}") endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS) set(SRCS @@ -695,7 +695,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}") else() - cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}") endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu") @@ -741,9 +741,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) - cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}") else() - cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}") endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu") From 9912b8ccb861593d76216afa583ac593faf5a309 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Tue, 18 Nov 2025 19:45:20 -0500 Subject: [PATCH 166/578] [Build] Add OpenAI triton_kernels (#28788) Signed-off-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath --- .gitignore | 3 ++ CMakeLists.txt | 5 ++ cmake/external_projects/triton_kernels.cmake | 53 +++++++++++++++++++ setup.py | 17 ++++++ .../layers/quantization/utils/mxfp4_utils.py | 2 + vllm/utils/import_utils.py | 40 +++++++++++++- 6 files changed, 119 insertions(+), 1 deletion(-) create mode 100644 cmake/external_projects/triton_kernels.cmake diff --git a/.gitignore b/.gitignore index 50070d7898fe..7cda86478664 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,9 @@ # vllm-flash-attn built from source vllm/vllm_flash_attn/* +# OpenAI triton kernels copied from source +vllm/third_party/triton_kernels/* + # triton jit .triton diff --git a/CMakeLists.txt b/CMakeLists.txt index c1c7478b9f3e..ae8e6175443f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1030,6 +1030,11 @@ if(VLLM_GPU_LANG STREQUAL "HIP") WITH_SOABI) endif() +# For CUDA and HIP builds also build the triton_kernels external package. 
+if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP") + include(cmake/external_projects/triton_kernels.cmake) +endif() + # For CUDA we also build and ship some external projects. if (VLLM_GPU_LANG STREQUAL "CUDA") include(cmake/external_projects/flashmla.cmake) diff --git a/cmake/external_projects/triton_kernels.cmake b/cmake/external_projects/triton_kernels.cmake new file mode 100644 index 000000000000..d35ad123dd9d --- /dev/null +++ b/cmake/external_projects/triton_kernels.cmake @@ -0,0 +1,53 @@ +# Install OpenAI triton_kernels from https://github.com/triton-lang/triton/tree/main/python/triton_kernels + +set(DEFAULT_TRITON_KERNELS_TAG "v3.5.0") + +# Set TRITON_KERNELS_SRC_DIR for use with local development with vLLM. We expect TRITON_KERNELS_SRC_DIR to +# be directly set to the triton_kernels python directory. +if (DEFINED ENV{TRITON_KERNELS_SRC_DIR}) + message(STATUS "[triton_kernels] Fetch from $ENV{TRITON_KERNELS_SRC_DIR}") + FetchContent_Declare( + triton_kernels + SOURCE_DIR $ENV{TRITON_KERNELS_SRC_DIR} + ) + +else() + set(TRITON_GIT "https://github.com/triton-lang/triton.git") + message (STATUS "[triton_kernels] Fetch from ${TRITON_GIT}:${DEFAULT_TRITON_KERNELS_TAG}") + FetchContent_Declare( + triton_kernels + # TODO (varun) : Fetch just the triton_kernels directory from Triton + GIT_REPOSITORY https://github.com/triton-lang/triton.git + GIT_TAG ${DEFAULT_TRITON_KERNELS_TAG} + GIT_PROGRESS TRUE + SOURCE_SUBDIR python/triton_kernels/triton_kernels + ) +endif() + +# Fetch content +FetchContent_MakeAvailable(triton_kernels) + +if (NOT triton_kernels_SOURCE_DIR) + message (FATAL_ERROR "[triton_kernels] Cannot resolve triton_kernels_SOURCE_DIR") +endif() + +if (DEFINED ENV{TRITON_KERNELS_SRC_DIR}) + set(TRITON_KERNELS_PYTHON_DIR "${triton_kernels_SOURCE_DIR}/") +else() + set(TRITON_KERNELS_PYTHON_DIR "${triton_kernels_SOURCE_DIR}/python/triton_kernels/triton_kernels/") +endif() + +message (STATUS "[triton_kernels] triton_kernels is available at ${TRITON_KERNELS_PYTHON_DIR}") + +add_custom_target(triton_kernels) + +# Ensure the vllm/third_party directory exists before installation +install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/third_party/triton_kernels\")") + +## Copy .py files to install directory. +install(DIRECTORY + ${TRITON_KERNELS_PYTHON_DIR} + DESTINATION + vllm/third_party/triton_kernels/ + COMPONENT triton_kernels + FILES_MATCHING PATTERN "*.py") diff --git a/setup.py b/setup.py index e9b36e2a2e03..5591bcb13244 100644 --- a/setup.py +++ b/setup.py @@ -299,6 +299,20 @@ def run(self): os.makedirs(os.path.dirname(dst_file), exist_ok=True) self.copy_file(file, dst_file) + if _is_cuda() or _is_hip(): + # copy vllm/third_party/triton_kernels/**/*.py from self.build_lib + # to current directory so that they can be included in the editable + # build + print( + f"Copying {self.build_lib}/vllm/third_party/triton_kernels " + "to vllm/third_party/triton_kernels" + ) + shutil.copytree( + f"{self.build_lib}/vllm/third_party/triton_kernels", + "vllm/third_party/triton_kernels", + dirs_exist_ok=True, + ) + class precompiled_build_ext(build_ext): """Disables extension building when using precompiled binaries.""" @@ -633,6 +647,9 @@ def _read_requirements(filename: str) -> list[str]: if _is_cuda() or _is_hip(): ext_modules.append(CMakeExtension(name="vllm._moe_C")) ext_modules.append(CMakeExtension(name="vllm.cumem_allocator")) + # Optional since this doesn't get built (produce an .so file). 
This is just + # copying the relevant .py files from the source repository. + ext_modules.append(CMakeExtension(name="vllm.triton_kernels", optional=True)) if _is_hip(): ext_modules.append(CMakeExtension(name="vllm._rocm_C")) diff --git a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py index 34a31bcf6a74..cbc46810a26a 100644 --- a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +++ b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py @@ -8,6 +8,7 @@ from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.triton_utils import triton +from vllm.utils.import_utils import has_triton_kernels from vllm.utils.torch_utils import direct_register_custom_op, is_torch_equal_or_newer logger = init_logger(__name__) @@ -15,6 +16,7 @@ def _swizzle_mxfp4(quant_tensor, scale, num_warps): """weight swizzle for mxfp4 moe, used for OAI mxfp4 kernel""" + assert has_triton_kernels() import triton_kernels.matmul_ogs_details.opt_flags as opt_flags from triton_kernels.numerics import InFlexData from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor diff --git a/vllm/utils/import_utils.py b/vllm/utils/import_utils.py index f01d2c7a6a33..ff0f0350fd94 100644 --- a/vllm/utils/import_utils.py +++ b/vllm/utils/import_utils.py @@ -18,6 +18,10 @@ import regex as re from typing_extensions import Never +from vllm.logger import init_logger + +logger = init_logger(__name__) + # TODO: This function can be removed if transformer_modules classes are # serialized by value when communicating between processes @@ -62,6 +66,35 @@ def import_pynvml(): return pynvml +@cache +def import_triton_kernels(): + """ + For convenience, prioritize triton_kernels that is available in + `site-packages`. Use `vllm.third_party.triton_kernels` as a fall-back. + """ + if _has_module("triton_kernels"): + import triton_kernels + + logger.debug_once( + f"Loading module triton_kernels from {triton_kernels.__file__}.", + scope="local", + ) + elif _has_module("vllm.third_party.triton_kernels"): + import vllm.third_party.triton_kernels as triton_kernels + + logger.debug_once( + f"Loading module triton_kernels from {triton_kernels.__file__}.", + scope="local", + ) + sys.modules["triton_kernels"] = triton_kernels + else: + logger.info_once( + "triton_kernels unavailable in this build. " + "Please consider installing triton_kernels from " + "https://github.com/triton-lang/triton/tree/main/python/triton_kernels" + ) + + def import_from_path(module_name: str, file_path: str | os.PathLike): """ Import a Python file according to its file path. 
@@ -397,7 +430,12 @@ def has_deep_gemm() -> bool: def has_triton_kernels() -> bool: """Whether the optional `triton_kernels` package is available.""" - return _has_module("triton_kernels") + is_available = _has_module("triton_kernels") or _has_module( + "vllm.third_party.triton_kernels" + ) + if is_available: + import_triton_kernels() + return is_available def has_tilelang() -> bool: From 1395461f5fb76145433c1dc8a3b7262ee3799bf8 Mon Sep 17 00:00:00 2001 From: tomeras91 <57313761+tomeras91@users.noreply.github.com> Date: Wed, 19 Nov 2025 02:49:36 +0200 Subject: [PATCH 167/578] [Hybrid][torch.compile] Refactor mamba2 forward to avoid obscuring linear projections under custom op (#28587) Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com> --- .../layers/mamba/mamba_mixer2.py | 156 ++++++++++-------- vllm/model_executor/models/bamba.py | 3 +- vllm/model_executor/models/falcon_h1.py | 4 +- .../model_executor/models/granitemoehybrid.py | 3 +- vllm/model_executor/models/mamba2.py | 3 +- vllm/model_executor/models/nemotron_h.py | 3 +- vllm/model_executor/models/zamba2.py | 6 +- 7 files changed, 90 insertions(+), 88 deletions(-) diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index fb45afa33dad..57313990b820 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -426,6 +426,10 @@ def __init__( # `ColumnParallelLinear` and `MergedColumnParallelLinear`, # and `set_weight_attrs` doesn't allow to override it self.conv1d.weight.data = self.conv1d.weight.data.unsqueeze(1) + conv_weights = self.conv1d.weight.view( + self.conv1d.weight.size(0), self.conv1d.weight.size(2) + ) + self.register_buffer("conv_weights", conv_weights, persistent=False) # - these are TPed by heads to reduce the size of the # temporal shape @@ -459,6 +463,17 @@ def __init__( intermediate_size, n_groups, self.use_rms_norm, eps=rms_norm_eps ) + # - get hidden_states, B and C after depthwise convolution. + self.split_hidden_states_B_C_fn = lambda hidden_states_B_C: torch.split( + hidden_states_B_C, + [ + self.intermediate_size // self.tp_size, + self.groups_ssm_state_size // self.tp_size, + self.groups_ssm_state_size // self.tp_size, + ], + dim=-1, + ) + compilation_config = get_current_vllm_config().compilation_config if prefix in compilation_config.static_forward_context: raise ValueError(f"Duplicate layer name: {prefix}") @@ -470,10 +485,24 @@ def __init__( self.cache_config = cache_config self.prefix = prefix + # Pre-compute sizes for forward pass + self.tped_intermediate_size = self.intermediate_size // self.tp_size + self.tped_conv_size = self.conv_dim // self.tp_size + self.tped_dt_size = self.num_heads // self.tp_size + + self.split_hidden_states_B_C_fn = lambda hidden_states_B_C: torch.split( + hidden_states_B_C, + [ + self.tped_intermediate_size, + self.groups_ssm_state_size // self.tp_size, + self.groups_ssm_state_size // self.tp_size, + ], + dim=-1, + ) + def forward_native( self, hidden_states: torch.Tensor, - output: torch.Tensor, mup_vector: torch.Tensor | None = None, ): pass @@ -481,22 +510,55 @@ def forward_native( def forward( self, hidden_states: torch.Tensor, - output: torch.Tensor, mup_vector: torch.Tensor | None = None, ): + # 1. Gated MLP's linear projection + projected_states, _ = self.in_proj(hidden_states) + if mup_vector is not None: + projected_states = projected_states * mup_vector + + # 2. 
Prepare inputs for conv + SSM + ssm_output = torch.empty( + [ + hidden_states.shape[0], + (self.num_heads // self.tp_size) * self.head_dim, + ], + dtype=hidden_states.dtype, + device=hidden_states.device, + ) + + # 3. conv + SSM + # (split `projected_states` into hidden_states_B_C, dt in the custom op to + # ensure it is not treated as an intermediate tensor by torch compile) torch.ops.vllm.mamba_mixer2( - hidden_states, - output, + projected_states, + ssm_output, self.prefix, - mup_vector, ) - def forward_cuda( + # 4. gated MLP + # GatedRMSNorm internally applying SiLU to the gate + # SiLU is applied internally before normalization, unlike standard + # norm usage + gate = projected_states[..., : self.tped_intermediate_size] + hidden_states = self.norm(ssm_output, gate) + + # 5. Final linear projection + output, _ = self.out_proj(hidden_states) + + return output + + def conv_ssm_forward( self, - hidden_states: torch.Tensor, + projected_states: torch.Tensor, output: torch.Tensor, - mup_vector: torch.Tensor | None = None, ): + hidden_states_B_C, dt = torch.split( + projected_states[..., self.tped_intermediate_size :], + [self.tped_conv_size, self.tped_dt_size], + dim=-1, + ) + forward_context = get_forward_context() # attn_metadata contains metadata necessary for the mamba2 triton # kernels to operate in continuous batching and in chunked prefill @@ -524,46 +586,13 @@ def forward_cuda( cu_chunk_seqlen_p = attn_metadata.cu_chunk_seqlen_p last_chunk_indices_p = attn_metadata.last_chunk_indices_p - # 1. Gated MLP's linear projection - projected_states, _ = self.in_proj(hidden_states) - - if mup_vector is not None: - projected_states = projected_states * mup_vector - - gate, hidden_states_B_C, dt = torch.split( - projected_states, - [ - self.intermediate_size // self.tp_size, - self.conv_dim // self.tp_size, - self.num_heads // self.tp_size, - ], - dim=-1, - ) - - conv_weights = self.conv1d.weight.view( - self.conv1d.weight.size(0), self.conv1d.weight.size(2) - ) - - # - get hidden_states, B and C after depthwise convolution. 
- split_hidden_states_B_C_fn = lambda hidden_states_B_C: torch.split( - hidden_states_B_C, - [ - self.intermediate_size // self.tp_size, - self.groups_ssm_state_size // self.tp_size, - self.groups_ssm_state_size // self.tp_size, - ], - dim=-1, - ) - if attn_metadata is None: # profile run hidden_states_B_C = ( hidden_states_B_C.transpose(0, 1).clone().transpose(0, 1) ).contiguous() - hidden_states, _B, _C = split_hidden_states_B_C_fn(hidden_states_B_C) - hidden_states = self.norm(hidden_states, gate) - out, _ = self.out_proj(hidden_states) - return out + hidden_states, _B, _C = self.split_hidden_states_B_C_fn(hidden_states_B_C) + return hidden_states # NOTE: V0 put prefill before decode, v1 puts decode before prefill num_prefills = attn_metadata.num_prefills # request count @@ -622,18 +651,8 @@ def forward_cuda( block_idx_first_scheduled_token_p = None num_computed_tokens_p = None - # Preallocate output tensor to avoid memcpy cost for merging prefill - # and decode outputs - preallocated_ssm_out = torch.empty( - [ - num_prefill_tokens + num_decodes, - (self.num_heads // self.tp_size) * self.head_dim, - ], - dtype=hidden_states.dtype, - device=hidden_states.device, - ) preallocated_ssm_out_d, preallocated_ssm_out_p = torch.split( - preallocated_ssm_out, + output[:num_actual_tokens], [num_decodes, num_prefill_tokens], dim=0, ) @@ -658,7 +677,7 @@ def forward_cuda( ) # this is the form that causal-conv see hidden_states_B_C_p = causal_conv1d_fn( x, - conv_weights, + self.conv_weights, self.conv1d.bias, activation=self.activation, conv_states=conv_state, @@ -673,7 +692,9 @@ def forward_cuda( query_start_loc=query_start_loc_p, ).transpose(0, 1)[:num_prefill_tokens] - hidden_states_p, B_p, C_p = split_hidden_states_B_C_fn(hidden_states_B_C_p) + hidden_states_p, B_p, C_p = self.split_hidden_states_B_C_fn( + hidden_states_B_C_p + ) # 3. State Space Model sequence transformation initial_states = None @@ -815,7 +836,7 @@ def forward_cuda( hidden_states_B_C_d = causal_conv1d_update( hidden_states_B_C_d, conv_state, - conv_weights, + self.conv_weights, self.conv1d.bias, self.activation, conv_state_indices=state_indices_tensor_d, @@ -823,7 +844,9 @@ def forward_cuda( initial_state_idx=block_idx_last_computed_token_d, ) - hidden_states_d, B_d, C_d = split_hidden_states_B_C_fn(hidden_states_B_C_d) + hidden_states_d, B_d, C_d = self.split_hidden_states_B_C_fn( + hidden_states_B_C_d + ) # 3. State Space Model sequence transformation n_groups = self.n_groups // self.tp_size @@ -861,15 +884,6 @@ def forward_cuda( out=preallocated_ssm_out_d.view(num_decodes, -1, self.head_dim), ) - # 4. gated MLP - # GatedRMSNorm internally applying SiLU to the gate - # SiLU is applied internally before normalization, unlike standard - # norm usage - hidden_states = self.norm(preallocated_ssm_out, gate[:num_actual_tokens]) - - # 5. 
Final linear projection - output[:num_actual_tokens], _ = self.out_proj(hidden_states) - def get_state_dtype(self) -> tuple[torch.dtype, torch.dtype]: assert self.model_config is not None assert self.cache_config is not None @@ -901,21 +915,19 @@ def get_attn_backend(self) -> type["AttentionBackend"]: def mamba_mixer2( - hidden_states: torch.Tensor, + projected_states: torch.Tensor, output: torch.Tensor, layer_name: str, - mup_vector: torch.Tensor | None = None, ) -> None: forward_context: ForwardContext = get_forward_context() self = forward_context.no_compile_layers[layer_name] - self.forward_cuda(hidden_states=hidden_states, output=output, mup_vector=mup_vector) + self.conv_ssm_forward(projected_states=projected_states, output=output) def mamba_mixer2_fake( - hidden_states: torch.Tensor, + projected_states: torch.Tensor, output: torch.Tensor, layer_name: str, - mup_vector: torch.Tensor | None = None, ) -> None: return diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py index e0a2defd5127..c6cc83487fec 100644 --- a/vllm/model_executor/models/bamba.py +++ b/vllm/model_executor/models/bamba.py @@ -138,8 +138,7 @@ def forward( else: hidden_states, residual = self.input_layernorm(hidden_states, residual) - output = torch.empty_like(hidden_states) - self.mamba(hidden_states, output) + output = self.mamba(hidden_states) # Fully Connected hidden_states, residual = self.pre_ff_layernorm(output, residual) hidden_states = self.feed_forward(hidden_states) diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py index 3653425b8e1c..b985847af5da 100644 --- a/vllm/model_executor/models/falcon_h1.py +++ b/vllm/model_executor/models/falcon_h1.py @@ -198,10 +198,8 @@ def forward( residual: torch.Tensor | None, **kwargs, ): - output = torch.empty_like(hidden_states) - self.mamba( + output = self.mamba( hidden_states, - output, mup_vector=self.mup_vector, ) return output, residual diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py index 05177f1d1ac2..a340112ec62a 100644 --- a/vllm/model_executor/models/granitemoehybrid.py +++ b/vllm/model_executor/models/granitemoehybrid.py @@ -115,8 +115,7 @@ def forward( ): residual = hidden_states hidden_states = self.input_layernorm(hidden_states) - output = torch.empty_like(hidden_states) - self.mamba(hidden_states, output) + output = self.mamba(hidden_states) hidden_states = residual + output * self.residual_multiplier residual = hidden_states diff --git a/vllm/model_executor/models/mamba2.py b/vllm/model_executor/models/mamba2.py index fc17f98be198..5fcfa9431230 100644 --- a/vllm/model_executor/models/mamba2.py +++ b/vllm/model_executor/models/mamba2.py @@ -87,8 +87,7 @@ def forward( else: hidden_states, residual = self.norm(hidden_states, residual) - output = torch.empty_like(hidden_states) - self.mixer(hidden_states, output) + output = self.mixer(hidden_states) return output, residual diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py index f7e0caf410e1..8675eff59222 100644 --- a/vllm/model_executor/models/nemotron_h.py +++ b/vllm/model_executor/models/nemotron_h.py @@ -376,8 +376,7 @@ def forward( else: hidden_states, residual = self.norm(hidden_states, residual) - output = torch.empty_like(hidden_states) - self.mixer(hidden_states, output) + output = self.mixer(hidden_states) return output, residual diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py 
index 64e6979c8fcf..729a9655d087 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -567,11 +567,7 @@ def forward( hidden_states = self.input_layernorm(hidden_states) # Process through Mamba mixer - output = torch.empty_like(hidden_states) - self.mamba( - hidden_states, - output, - ) + output = self.mamba(hidden_states) # residual connection after mamba hidden_states = residual + output From da94c7c0eb8dabea9c500dbd70fa042497497689 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Tue, 18 Nov 2025 16:52:41 -0800 Subject: [PATCH 168/578] Move online quantization to `model.load_weights` (#26327) Signed-off-by: Jerry Zhang --- examples/offline_inference/rlhf.py | 2 +- .../offline_inference/rlhf_online_quant.py | 162 ++++++++++++++ .../model_loader/default_loader.py | 46 +--- .../model_loader/online_quantization.py | 205 +++++++++++------- vllm/model_executor/model_loader/utils.py | 8 + vllm/model_executor/models/utils.py | 4 + 6 files changed, 314 insertions(+), 113 deletions(-) create mode 100644 examples/offline_inference/rlhf_online_quant.py diff --git a/examples/offline_inference/rlhf.py b/examples/offline_inference/rlhf.py index 0c09e603271d..6f05968ce065 100644 --- a/examples/offline_inference/rlhf.py +++ b/examples/offline_inference/rlhf.py @@ -62,7 +62,7 @@ def __init__(self, *args, **kwargs): # Create a placement group that reserves GPU 1–2 for the vLLM inference engine. # Learn more about Ray placement groups: -# https://docs.ray.io/en/latest/placement-groups.html +# https://docs.ray.io/en/latest/ray-core/scheduling/placement-group.html pg_inference = placement_group([{"GPU": 1, "CPU": 0}] * 2) ray.get(pg_inference.ready()) scheduling_inference = PlacementGroupSchedulingStrategy( diff --git a/examples/offline_inference/rlhf_online_quant.py b/examples/offline_inference/rlhf_online_quant.py new file mode 100644 index 000000000000..2d98ad22c589 --- /dev/null +++ b/examples/offline_inference/rlhf_online_quant.py @@ -0,0 +1,162 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Demonstrates reinforcement learning from human feedback (RLHF) using vLLM and Ray. + +The script separates training and inference workloads onto distinct GPUs +so that Ray can manage process placement and inter-process communication. +A Hugging Face Transformer model occupies GPU 0 for training, whereas a +tensor-parallel vLLM inference engine occupies GPU 1–2. + +The example performs the following steps: + +* Load the training model on GPU 0. +* Split the inference model across GPUs 1–2 using vLLM's tensor parallelism + and Ray placement groups. +* Generate text from a list of prompts using the inference engine. +* Update the weights of the training model and broadcast the updated weights + to the inference engine by using a Ray collective RPC group. Note that + for demonstration purposes we simply zero out the weights. + +For a production-ready implementation that supports multiple training and +inference replicas, see the OpenRLHF framework: +https://github.com/OpenRLHF/OpenRLHF + +This example assumes a single-node cluster with three GPUs, but Ray +supports multi-node clusters. vLLM expects the GPUs are only used for vLLM +workloads. Residual GPU activity interferes with vLLM memory profiling and +causes unexpected behavior. 
+""" + +import json +import os + +import ray +import torch +from ray.util.placement_group import placement_group +from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy +from rlhf_utils import stateless_init_process_group +from torchao.core.config import config_to_dict +from torchao.quantization import ( + Float8DynamicActivationFloat8WeightConfig, + PerRow, +) +from transformers import AutoModelForCausalLM + +from vllm import LLM, SamplingParams +from vllm.utils.network_utils import get_ip, get_open_port + + +class MyLLM(LLM): + """Configure the vLLM worker for Ray placement group execution.""" + + def __init__(self, *args, **kwargs): + # Remove the top-level CUDA_VISIBLE_DEVICES variable set by Ray + # so that vLLM can manage its own device placement within the worker. + os.environ.pop("CUDA_VISIBLE_DEVICES", None) + super().__init__(*args, **kwargs) + + +# Load the OPT-125M model onto GPU 0 for the training workload. +train_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") +train_model.to("cuda:0") + +# Initialize Ray and set the visible devices. The vLLM engine will +# be placed on GPUs 1 and 2. +os.environ["CUDA_VISIBLE_DEVICES"] = "1,2" +ray.init() + +# Create a placement group that reserves GPU 1–2 for the vLLM inference engine. +# Learn more about Ray placement groups: +# https://docs.ray.io/en/latest/ray-core/scheduling/placement-group.html +pg_inference = placement_group([{"GPU": 1, "CPU": 0}] * 2) +ray.get(pg_inference.ready()) +scheduling_inference = PlacementGroupSchedulingStrategy( + placement_group=pg_inference, + placement_group_capture_child_tasks=True, + placement_group_bundle_index=0, +) + +# Launch the vLLM inference engine. The `enforce_eager` flag reduces +# start-up latency. + +# generate torchao quantization config for RL rollout +# see https://github.com/vllm-project/vllm/pull/23014 for instructions to +# use serialized config files instead of passing around json string +config = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()) + +json_str = json.dumps(config_to_dict(config)) + +llm = ray.remote( + num_cpus=0, + num_gpus=0, + scheduling_strategy=scheduling_inference, +)(MyLLM).remote( + model="facebook/opt-125m", + hf_overrides={"quantization_config_dict_json": json_str}, + enforce_eager=True, + worker_extension_cls="rlhf_utils.WorkerExtension", + tensor_parallel_size=2, + distributed_executor_backend="ray", +) + +# Generate text from the prompts. +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + +sampling_params = SamplingParams(temperature=0) + +outputs = ray.get(llm.generate.remote(prompts, sampling_params)) + +print("-" * 50) +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") + print("-" * 50) + +# Set up the communication channel between the training process and the +# inference engine. +master_address = get_ip() +master_port = get_open_port() + +handle = llm.collective_rpc.remote( + "init_weight_update_group", args=(master_address, master_port, 1, 3) +) + +model_update_group = stateless_init_process_group( + master_address, master_port, 0, 3, torch.device("cuda:0") +) +ray.get(handle) + +# Simulate a training step by zeroing out all model weights. +# In a real RLHF training loop the weights would be updated using the gradient +# from an RL objective such as PPO on a reward model. 
+for name, p in train_model.named_parameters(): + p.data.zero_() + +# Synchronize the updated weights to the inference engine. +for name, p in train_model.named_parameters(): + dtype_name = str(p.dtype).split(".")[-1] + handle = llm.collective_rpc.remote( + "update_weight", args=(name, dtype_name, p.shape) + ) + model_update_group.broadcast(p, src=0, stream=torch.cuda.current_stream()) + ray.get(handle) + +# Verify that the inference weights have been updated. +assert all(ray.get(llm.collective_rpc.remote("check_weights_changed"))) + +# Generate text with the updated model. The output is expected to be nonsense +# because the weights are zero. +outputs_updated = ray.get(llm.generate.remote(prompts, sampling_params)) +print("-" * 50) +for output in outputs_updated: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") + print("-" * 50) diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py index c06ac550a94a..b80026741781 100644 --- a/vllm/model_executor/model_loader/default_loader.py +++ b/vllm/model_executor/model_loader/default_loader.py @@ -22,6 +22,7 @@ fastsafetensors_weights_iterator, filter_duplicate_safetensors_files, filter_files_not_needed_for_inference, + get_quant_config, maybe_download_from_modelscope, multi_thread_pt_weights_iterator, multi_thread_safetensors_weights_iterator, @@ -273,42 +274,17 @@ def download_model(self, model_config: ModelConfig) -> None: ) def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None: - if model_config.quantization == "torchao" and torchao_version_at_least( - "0.14.0" - ): - self.load_config.safetensors_load_strategy = "torchao" - weights_to_load = {name for name, _ in model.named_parameters()} - - # if we don't have `model.weight_metadata_and_attr_saved` defined and - # set to True, it means that this is either offline quantization case - # or the first run of online quantization - # see online_quantization.py for detailed notes - offline_quantization_or_first_run_of_online_quantization = not getattr( - model, "weight_metadata_and_attr_saved", False - ) + if model_config.quantization == "torchao": + quant_config = get_quant_config(model_config, self.load_config) + if ( + hasattr(quant_config, "is_checkpoint_torchao_serialized") + and quant_config.is_checkpoint_torchao_serialized + and torchao_version_at_least("0.14.0") + ): + self.load_config.safetensors_load_strategy = "torchao" - if model_config.quantization is None: - # model is not quantized - loaded_weights = model.load_weights( - self.get_all_weights(model_config, model) - ) - elif offline_quantization_or_first_run_of_online_quantization: - # case 1: offline quantized checkpoint - # case 2: Step I1 first run of weight loading with - # online quantization - # see online_quantization.py for detailed notes - loaded_weights = model.load_weights( - self.get_all_weights(model_config, model) - ) - else: - # to avoid circular dependency - from vllm.model_executor.model_loader.online_quantization import ( - load_weights_and_online_quantize, - ) - - # subsequent runs of weight loading with online - # quantization - loaded_weights = load_weights_and_online_quantize(self, model, model_config) + weights_to_load = {name for name, _ in model.named_parameters()} + loaded_weights = model.load_weights(self.get_all_weights(model_config, model)) self.counter_after_loading_weights = time.perf_counter() logger.info_once( diff --git 
a/vllm/model_executor/model_loader/online_quantization.py b/vllm/model_executor/model_loader/online_quantization.py index 890dd7231a0e..f330af85bbe8 100644 --- a/vllm/model_executor/model_loader/online_quantization.py +++ b/vllm/model_executor/model_loader/online_quantization.py @@ -2,13 +2,13 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import types +from collections.abc import Iterable import torch from torch import nn from vllm.config import ModelConfig from vllm.logger import init_logger -from vllm.model_executor.model_loader.default_loader import DefaultModelLoader from vllm.model_executor.model_loader.utils import process_weights_after_loading logger = init_logger(__name__) @@ -56,6 +56,9 @@ # R4. quantize weights (by calling process_weights_after_loading), # also set `process_weights_after_loading_already_called` to # True to stop it from running again +# R5. (workaround for cudagraph), we restore the weight params to original quantized +# weights params, and use original_weight_param.copy_(updated_weight_param) so that +# the weight update work well with cudagraph # process_weights_after_loading (if called): # this will be skipped since it's already ran in # load_weights @@ -69,14 +72,6 @@ def maybe_save_metadata_and_attributes_for_weight_reloading( if model_config.quantization != "torchao": return - if getattr(model, "process_weights_after_loading_already_called", False): - # In case `process_weights_after_loading` is called multiple times - # we'll skip it at later times - logger.warning( - "process_weights_after_loading already called for model %s", model - ) - return - from vllm.model_executor.model_loader.weight_utils import get_quant_config quant_config = get_quant_config(model_config, None) @@ -137,6 +132,7 @@ def maybe_save_metadata_and_attributes_for_weight_reloading( else: model.recorded_weight_attr[name][key] = attr # mark the metadata and attributes saved so we don't run it again + model._model_config = model_config model.weight_metadata_and_attr_saved = True @@ -148,77 +144,132 @@ def _bond_method_to_cls(func, obj): return types.MethodType(func, obj) -def load_weights_and_online_quantize( - model_loader: DefaultModelLoader, model: nn.Module, model_config: ModelConfig -) -> set[str]: +def support_quantized_model_reload_from_hp_weights(original_load_weights): + """Decorator for `load_weights` method for AutoWeightsLoader.load_weights to support + reloading high precision (bfloat16/float16/float32) weight for an already quantized + model, this involves restoring the weights to a high precision weights and + then online quantize the weights + """ # online quantization, right now only enabled for # torchao - # R1, R2, R3, R4 in the Notes - - # TODO: Add fp8 support - assert model_config.quantization == "torchao", ( - "online quantization is only enabled for torchao currently" - ) - # TODO: use create_weights to restore the weights to original state - - # Step R1: First restore the quantized weights to original bfloat16 - # weights, with original metadata (shape, dtype, device) - # and attributes, so that bfloat16 weights can be loaded properly - existing_param_names = dict(model.named_parameters(remove_duplicate=False)).keys() - named_modules = dict(model.named_modules(remove_duplicate=False)) - model_device = None - - # Step R2: recover the parameter to the state before first loading - for name, d in model.original_weights_rebuild_keys.items(): - _shape = d["shape"] - _dtype = d["dtype"] - _device = d["device"] + # R1, R2, R3, R4, R5 in the Notes 
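Step R5 above is needed because a captured CUDA graph replays its kernels against the exact tensor storage that existed at capture time, so rebinding a module attribute to a fresh Parameter after re-quantization would leave the replayed graph reading stale memory. A minimal runnable sketch of the in-place update pattern, with original_param and updated_param as toy stand-ins for the captured quantized weight and the freshly re-quantized weight:

    import torch

    original_param = torch.nn.Parameter(torch.zeros(4, 4), requires_grad=False)
    updated_param = torch.nn.Parameter(torch.ones(4, 4), requires_grad=False)

    # Copy the new values into the original storage instead of replacing the
    # Parameter object, so addresses captured earlier stay valid.
    with torch.no_grad():
        original_param.copy_(updated_param)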
+ + def patched_model_load_weights( + auto_weight_loader, weights: Iterable[tuple[str, torch.Tensor]], *, mapper=None + ) -> set[str]: + model = auto_weight_loader.module + offline_quantization_or_first_run_of_online_quantization = not getattr( + model, "weight_metadata_and_attr_saved", False + ) + + # if we don't have `model.weight_metadata_and_attr_saved` defined and + # set to True, it means that this is either offline quantization case + # or the first run of online quantization + # see Notes in this file for more details + if offline_quantization_or_first_run_of_online_quantization: + # case 1: offline quantized checkpoint + # case 2: Step I1 first run of weight loading with + # online quantization + return original_load_weights(auto_weight_loader, weights, mapper=mapper) + + model_config = model._model_config + + # TODO: Add fp8 support + assert model_config.quantization == "torchao", ( + "online quantization is only enabled for torchao currently" + ) + # TODO: use create_weights to restore the weights to original state + + # Step R1: First restore the quantized weights to original bfloat16 + # weights, with original metadata (shape, dtype, device) + # and attributes, so that bfloat16 weights can be loaded properly + # TODO: maybe set remove_duplicate to True? + original_quantized_weight_dict = dict( + model.named_parameters(remove_duplicate=False) + ) + named_modules = dict(model.named_modules(remove_duplicate=False)) + model_device = None + + for name, d in model.original_weights_rebuild_keys.items(): + _shape = d["shape"] + _dtype = d["dtype"] + _device = d["device"] + if model_device is not None: + assert model_device == _device, ( + "Expecting all weights " + "to be in the same device for now, got both: " + f"{model_device} and {_device}" + ) + else: + model_device = _device + + if name in original_quantized_weight_dict: + module_name, weight_name = name.rsplit(".", 1) + module = named_modules[module_name] + setattr( + module, + weight_name, + torch.nn.Parameter( + torch.empty(_shape, dtype=_dtype, device=_device), + requires_grad=False, + ), + ) + + # Step R2: recover the weight attributes to the state before first loading + # recorded_weight_attr is + # {"weight_name": {"weight_attr_key": attr}} + # e.g. + # { + # { + # "layer.0.weight": { + # "weight_loader": weight_loader_function_object, + # "input_dim": 0, ... 
+ # }, + # "layer.1.weight": ..., + # } + # } + for full_weight_name, weight_attr_dict in model.recorded_weight_attr.items(): + for attr_name, attr in weight_attr_dict.items(): + module_name, weight_name = full_weight_name.rsplit(".", 1) + module = named_modules[module_name] + weight = getattr(module, weight_name) + if not hasattr(weight, attr_name): + setattr(weight, attr_name, _bond_method_to_cls(attr, weight)) + + # Step R3: reload bfloat16 / high precision weights + updated_params = original_load_weights( + auto_weight_loader, weights, mapper=mapper + ) + + # Step R4: online quantize the weights + # manually process weights after loading + model.process_weights_after_loading_already_called = False if model_device is not None: - assert model_device == _device, ( - "Expecting all weights " - "to be in the same device for now, got both: " - f"{model_device} and {_device}" - ) + process_weights_after_loading(model, model_config, model_device) else: - model_device = _device - - if name in existing_param_names: - module_name, weight_name = name.rsplit(".", 1) - module = named_modules[module_name] - setattr( - module, - weight_name, - torch.nn.Parameter(torch.empty(_shape, dtype=_dtype, device=_device)), + logger.warning_once( + "model_device is None, skip calling process_weights_after_loading" ) - # recorded_weight_attr is - # {"weight_name": {"weight_attr_key": attr}} - # e.g. - # { - # { - # "layer.0.weight": { - # "weight_loader": weight_loader_function_object, - # "input_dim": 0, ... - # }, - # "layer.1.weight": ..., - # } - # } - for full_weight_name, weight_attr_dict in model.recorded_weight_attr.items(): - for attr_name, attr in weight_attr_dict.items(): - module_name, weight_name = full_weight_name.rsplit(".", 1) - module = named_modules[module_name] - weight = getattr(module, weight_name) - if not hasattr(weight, attr_name): - setattr(weight, attr_name, _bond_method_to_cls(attr, weight)) - - # Step I1: reload bfloat16 / high precision weights - loaded_weights = model.load_weights( - model_loader.get_all_weights(model_config, model) - ) - - # Step I2: online quantize the weights - # manually process weights after loading - model.process_weights_after_loading_already_called = False - process_weights_after_loading(model, model_config, model_device) - model.process_weights_after_loading_already_called = True - return loaded_weights + # Step R5 (workaround for cudagraph): restore the original quantized weights + # and do a copy_ of the currents weights to the original weights + updated_quantized_weights = dict(model.named_parameters(remove_duplicate=False)) + for name in model.original_weights_rebuild_keys: + if name in original_quantized_weight_dict: + original_quantized_weight = original_quantized_weight_dict[name] + updated_quantized_weight = updated_quantized_weights[name] + + module_name, weight_name = name.rsplit(".", 1) + module = named_modules[module_name] + setattr(module, weight_name, original_quantized_weight) + with torch.no_grad(): + original_quantized_weight.copy_(updated_quantized_weight) + + del original_quantized_weight_dict + del named_modules + del updated_quantized_weight + + model.process_weights_after_loading_already_called = True + return updated_params + + return patched_model_load_weights diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index ba708a098c0d..e74434e9d12c 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -88,6 +88,14 @@ def initialize_model( def 
process_weights_after_loading( model: nn.Module, model_config: ModelConfig, target_device: torch.device ) -> None: + if getattr(model, "process_weights_after_loading_already_called", False): + # In case `process_weights_after_loading` is called multiple times + # we'll skip it at later times + logger.debug_once( + "process_weights_after_loading already called for model %s", model + ) + return + # to avoid circular dependency from vllm.model_executor.model_loader.online_quantization import ( maybe_save_metadata_and_attributes_for_weight_reloading, diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index ca5af358e2ee..ccefd7e66697 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -21,6 +21,9 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, ) +from vllm.model_executor.model_loader.online_quantization import ( + support_quantized_model_reload_from_hp_weights, +) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.interfaces import supports_any_eagle from vllm.multimodal import NestedTensors @@ -316,6 +319,7 @@ def _load_module( ) raise ValueError(msg) + @support_quantized_model_reload_from_hp_weights def load_weights( self, weights: Iterable[tuple[str, torch.Tensor]], From 40b6b38f2c8f8df1dbc145b48df99575f191014f Mon Sep 17 00:00:00 2001 From: Jialin Ouyang Date: Tue, 18 Nov 2025 18:10:02 -0800 Subject: [PATCH 169/578] [Core] Switch Flat logprob control from environment variable to SamplingParams (#28914) Signed-off-by: Jialin Ouyang Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com> --- tests/samplers/test_logprobs.py | 3 +-- tests/test_logprobs.py | 32 ++++++++++---------------------- vllm/envs.py | 6 ------ vllm/logprobs.py | 10 ++++------ vllm/sampling_params.py | 6 ++++++ vllm/v1/engine/logprobs.py | 17 ++++++++++++----- 6 files changed, 33 insertions(+), 41 deletions(-) diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index c9d227599cde..ea40c4802720 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -24,9 +24,7 @@ def test_ranks( greedy, flat_logprobs, example_prompts, - monkeypatch: pytest.MonkeyPatch, ): - monkeypatch.setenv("VLLM_FLAT_LOGPROBS", "1" if flat_logprobs else "0") with vllm_runner(model, dtype=dtype, max_logprobs=MAX_LOGPROBS) as vllm_model: tokenizer = vllm_model.llm.get_tokenizer() example_prompt_tokens = [tokenizer.encode(prompt) for prompt in example_prompts] @@ -36,6 +34,7 @@ def test_ranks( max_tokens=MAX_TOKENS, logprobs=NUM_TOP_LOGPROBS, prompt_logprobs=NUM_PROMPT_LOGPROBS, + flat_logprobs=flat_logprobs, ) results = vllm_model.generate_w_logprobs(example_prompts, sampling_params) diff --git a/tests/test_logprobs.py b/tests/test_logprobs.py index d26a460d2bca..75e9d337aa24 100644 --- a/tests/test_logprobs.py +++ b/tests/test_logprobs.py @@ -2,8 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest - from vllm.logprobs import ( FlatLogprobs, Logprob, @@ -14,24 +12,20 @@ ) -def test_create_logprobs_non_flat(monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.setenv("VLLM_FLAT_LOGPROBS", "0") - - prompt_logprobs = create_prompt_logprobs() +def test_create_logprobs_non_flat() -> None: + prompt_logprobs = create_prompt_logprobs(flat_logprobs=False) assert isinstance(prompt_logprobs, list) # Ensure first prompt position logprobs is None assert len(prompt_logprobs) == 1 
assert prompt_logprobs[0] is None - sample_logprobs = create_sample_logprobs() + sample_logprobs = create_sample_logprobs(flat_logprobs=False) assert isinstance(sample_logprobs, list) assert len(sample_logprobs) == 0 -def test_create_logprobs_flat(monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.setenv("VLLM_FLAT_LOGPROBS", "1") - - prompt_logprobs = create_prompt_logprobs() +def test_create_logprobs_flat() -> None: + prompt_logprobs = create_prompt_logprobs(flat_logprobs=True) assert isinstance(prompt_logprobs, FlatLogprobs) assert prompt_logprobs.start_indices == [0] assert prompt_logprobs.end_indices == [0] @@ -43,7 +37,7 @@ def test_create_logprobs_flat(monkeypatch: pytest.MonkeyPatch) -> None: assert len(prompt_logprobs) == 1 assert prompt_logprobs[0] == dict() - sample_logprobs = create_sample_logprobs() + sample_logprobs = create_sample_logprobs(flat_logprobs=True) assert isinstance(sample_logprobs, FlatLogprobs) assert len(sample_logprobs.start_indices) == 0 assert len(sample_logprobs.end_indices) == 0 @@ -54,11 +48,8 @@ def test_create_logprobs_flat(monkeypatch: pytest.MonkeyPatch) -> None: assert len(sample_logprobs) == 0 -def test_append_logprobs_for_next_position_none_flat( - monkeypatch: pytest.MonkeyPatch, -) -> None: - monkeypatch.setenv("VLLM_FLAT_LOGPROBS", "0") - logprobs = create_sample_logprobs() +def test_append_logprobs_for_next_position_none_flat() -> None: + logprobs = create_sample_logprobs(flat_logprobs=False) append_logprobs_for_next_position( logprobs, token_ids=[1], @@ -85,11 +76,8 @@ def test_append_logprobs_for_next_position_none_flat( ] -def test_append_logprobs_for_next_position_flat( - monkeypatch: pytest.MonkeyPatch, -) -> None: - monkeypatch.setenv("VLLM_FLAT_LOGPROBS", "1") - logprobs = create_sample_logprobs() +def test_append_logprobs_for_next_position_flat() -> None: + logprobs = create_sample_logprobs(flat_logprobs=True) append_logprobs_for_next_position( logprobs, token_ids=[1], diff --git a/vllm/envs.py b/vllm/envs.py index 6bf05803e14e..62b3344ccd85 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -225,7 +225,6 @@ VLLM_DISABLE_SHARED_EXPERTS_STREAM: bool = False VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD: int = 256 VLLM_COMPILE_CACHE_SAVE_FORMAT: Literal["binary", "unpacked"] = "binary" - VLLM_FLAT_LOGPROBS: bool = False def get_default_cache_root(): @@ -1499,11 +1498,6 @@ def get_vllm_port() -> int | None: "VLLM_COMPILE_CACHE_SAVE_FORMAT": env_with_choices( "VLLM_COMPILE_CACHE_SAVE_FORMAT", "binary", ["binary", "unpacked"] ), - # Flag to enable FlatLogprobs whose GC overhead is significantly smaller than - # the original list[dict[int, Logprob]] approach. - # After enabled, PromptLogprobs and SampleLogprobs would populated as - # FlatLogprobs. - "VLLM_FLAT_LOGPROBS": lambda: bool(int(os.getenv("VLLM_FLAT_LOGPROBS", "0"))), } # --8<-- [end:env-vars-definition] diff --git a/vllm/logprobs.py b/vllm/logprobs.py index a34398db2c96..6a820308f523 100644 --- a/vllm/logprobs.py +++ b/vllm/logprobs.py @@ -5,8 +5,6 @@ from dataclasses import dataclass, field from typing import overload -import vllm.envs as envs - # We use dataclass for now because it is used for # openai server output, and msgspec is not serializable. 
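With this commit the flat layout is requested per call through SamplingParams rather than through the removed VLLM_FLAT_LOGPROBS environment variable. A rough usage sketch along the lines of the updated test above; the model name and prompt are placeholders:

    from vllm import LLM, SamplingParams

    llm = LLM(model="facebook/opt-125m")
    params = SamplingParams(
        temperature=0.0,
        max_tokens=16,
        logprobs=5,           # top-5 logprobs per generated token
        prompt_logprobs=5,    # top-5 logprobs per prompt token
        flat_logprobs=True,   # populate FlatLogprobs instead of list[dict]
    )
    outputs = llm.generate(["The capital of France is"], params)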
@@ -161,17 +159,17 @@ def __iter__(self) -> Iterator[LogprobsOnePosition]: SampleLogprobs = FlatLogprobs | list[LogprobsOnePosition] -def create_prompt_logprobs() -> PromptLogprobs: +def create_prompt_logprobs(flat_logprobs: bool) -> PromptLogprobs: """Creates a container to store prompt logprobs for a request""" - logprobs = FlatLogprobs() if envs.VLLM_FLAT_LOGPROBS else [] + logprobs = FlatLogprobs() if flat_logprobs else [] # NOTE: logprob of first prompt token is None. logprobs.append(None) return logprobs -def create_sample_logprobs() -> SampleLogprobs: +def create_sample_logprobs(flat_logprobs: bool) -> SampleLogprobs: """Creates a container to store decode logprobs for a request""" - return FlatLogprobs() if envs.VLLM_FLAT_LOGPROBS else [] + return FlatLogprobs() if flat_logprobs else [] def append_logprobs_for_next_position( diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 901d66163452..0fb1d67687c8 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -204,6 +204,12 @@ class SamplingParams( prompt_logprobs: int | None = None """Number of log probabilities to return per prompt token. When set to -1, return all `vocab_size` log probabilities.""" + flat_logprobs: bool = False + """Whether to return logprobs in flatten format (i.e. FlatLogprob) + for better performance. + NOTE: GC costs of FlatLogprobs is significantly smaller than + list[dict[int, Logprob]]. After enabled, PromptLogprobs and + SampleLogprobs would populated as FlatLogprobs.""" # NOTE: This parameter is only exposed at the engine level for now. # It is not exposed in the OpenAI API server, as the OpenAI API does # not support returning only a list of token IDs. diff --git a/vllm/v1/engine/logprobs.py b/vllm/v1/engine/logprobs.py index b618d2347265..63064a2c65d6 100644 --- a/vllm/v1/engine/logprobs.py +++ b/vllm/v1/engine/logprobs.py @@ -43,15 +43,22 @@ def from_new_request( tokenizer: AnyTokenizer | None, request: EngineCoreRequest, ) -> "LogprobsProcessor": - assert request.sampling_params is not None - num_logprobs = request.sampling_params.logprobs - num_prompt_logprobs = request.sampling_params.prompt_logprobs + sampling_params = request.sampling_params + assert sampling_params is not None + num_logprobs = sampling_params.logprobs + num_prompt_logprobs = sampling_params.prompt_logprobs return cls( tokenizer=tokenizer, cumulative_logprob=(None if num_logprobs is None else 0.0), - logprobs=(None if num_logprobs is None else create_sample_logprobs()), + logprobs=( + None + if num_logprobs is None + else create_sample_logprobs(sampling_params.flat_logprobs) + ), prompt_logprobs=( - None if num_prompt_logprobs is None else create_prompt_logprobs() + None + if num_prompt_logprobs is None + else create_prompt_logprobs(sampling_params.flat_logprobs) ), num_prompt_logprobs=num_prompt_logprobs, num_logprobs=num_logprobs, From 20852c8f4c10d80204c47e0cb85f5b252ff51c86 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Wed, 19 Nov 2025 10:32:00 +0800 Subject: [PATCH 170/578] [CPU] Refactor CPU WNA16 (#28826) Signed-off-by: jiang1.li --- .../scripts/hardware_ci/run-cpu-test.sh | 11 +- cmake/cpu_extension.cmake | 1 + csrc/cpu/cpu_attn_impl.hpp | 2 +- csrc/cpu/cpu_types_x86.hpp | 47 +- csrc/cpu/cpu_wna16.cpp | 402 +++++++++++ csrc/cpu/dnnl_helper.cpp | 6 +- csrc/cpu/micro_gemm/cpu_micro_gemm_amx.hpp | 245 +++++++ csrc/cpu/micro_gemm/cpu_micro_gemm_impl.hpp | 91 +++ csrc/cpu/micro_gemm/cpu_micro_gemm_vec.hpp | 115 ++++ csrc/cpu/torch_bindings.cpp | 16 + csrc/cpu/utils.hpp | 55 ++ 
docs/getting_started/installation/cpu.md | 4 +- requirements/cpu.txt | 1 - tests/quantization/test_cpu_wna16.py | 23 + vllm/_custom_ops.py | 25 + vllm/config/model.py | 2 + vllm/envs.py | 5 - .../layers/fused_moe/cpu_fused_moe.py | 49 -- .../fused_moe/unquantized_fused_moe_method.py | 2 +- .../layers/quantization/__init__.py | 5 + .../layers/quantization/cpu_wna16.py | 625 ++++++++++++++++++ .../layers/quantization/ipex_quant.py | 2 +- 22 files changed, 1656 insertions(+), 78 deletions(-) create mode 100644 csrc/cpu/cpu_wna16.cpp create mode 100644 csrc/cpu/micro_gemm/cpu_micro_gemm_amx.hpp create mode 100644 csrc/cpu/micro_gemm/cpu_micro_gemm_impl.hpp create mode 100644 csrc/cpu/micro_gemm/cpu_micro_gemm_vec.hpp create mode 100644 csrc/cpu/utils.hpp create mode 100644 tests/quantization/test_cpu_wna16.py create mode 100644 vllm/model_executor/layers/quantization/cpu_wna16.py diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index 7479c43977d7..2267718f75ca 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -73,12 +73,11 @@ function cpu_tests() { pytest -x -s -v \ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs" - # Note: disable it until supports V1 - # Run AWQ test - # docker exec cpu-test-"$NUMA_NODE" bash -c " - # set -e - # pytest -x -s -v \ - # tests/quantization/test_ipex_quant.py" + # Run AWQ/GPTQ test + docker exec cpu-test-"$NUMA_NODE" bash -c " + set -e + pytest -x -s -v \ + tests/quantization/test_cpu_wna16.py" # Run multi-lora tests docker exec cpu-test-"$NUMA_NODE" bash -c " diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index aa84125818d1..fbbb03c5ed46 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -375,6 +375,7 @@ set(VLLM_EXT_SRC if (AVX512_FOUND AND NOT AVX512_DISABLED) set(VLLM_EXT_SRC "csrc/cpu/shm.cpp" + "csrc/cpu/cpu_wna16.cpp" ${VLLM_EXT_SRC}) if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI) set(VLLM_EXT_SRC diff --git a/csrc/cpu/cpu_attn_impl.hpp b/csrc/cpu/cpu_attn_impl.hpp index 344296528b65..294b4f714a76 100644 --- a/csrc/cpu/cpu_attn_impl.hpp +++ b/csrc/cpu/cpu_attn_impl.hpp @@ -1,7 +1,6 @@ #ifndef CPU_ATTN_HPP #define CPU_ATTN_HPP -#include #include #include @@ -12,6 +11,7 @@ #include "cpu_types.hpp" #include "scratchpad_manager.h" #include "cpu_attn_macros.h" +#include "utils.hpp" namespace cpu_attention { enum class ISA { AMX, VEC, VEC16 }; diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp index 7ddf028e6e13..6f51277f7844 100644 --- a/csrc/cpu/cpu_types_x86.hpp +++ b/csrc/cpu/cpu_types_x86.hpp @@ -104,6 +104,8 @@ struct FP16Vec16 : public Vec { explicit FP16Vec16(bool, void* ptr) : reg(_mm256_stream_load_si256((__m256i*)ptr)) {} + explicit FP16Vec16(const c10::Half v) : reg(_mm256_set1_epi16(v.x)) {} + explicit FP16Vec16(const FP32Vec16&); void save(void* ptr) const { _mm256_storeu_si256((__m256i*)ptr, reg); } @@ -141,6 +143,8 @@ struct BF16Vec16 : public Vec { explicit BF16Vec16(bool, void* ptr) : reg(_mm256_stream_load_si256((__m256i*)ptr)) {} + explicit BF16Vec16(const c10::BFloat16 v) : reg(_mm256_set1_epi16(v.x)) {} + explicit BF16Vec16(const FP32Vec16&); void save(void* ptr) const { _mm256_storeu_si256((__m256i*)ptr, reg); } @@ -350,6 +354,22 @@ struct FP32Vec16 : public Vec { explicit FP32Vec16(__m512 data) : reg(data) {} + // de-pack 4 bit values + explicit FP32Vec16(int64_t value, const FP32Vec16& lut) { + int64_t mask_0 = 
0x0F0F0F0F0F0F0F0F; + int64_t mask_1 = 0xF0F0F0F0F0F0F0F0; + int64_t value_0 = value & mask_0; + int64_t value_1 = value & mask_1; + __m128i vec_0 = _mm_movpi64_epi64((__m64)value_0); + __m128i vec_1 = _mm_movpi64_epi64((__m64)value_1); + vec_0 = _mm_cvtepu8_epi16(vec_0); + vec_1 = _mm_cvtepu8_epi16(vec_1); + vec_1 = _mm_slli_epi16(vec_1, 4); + __m128i vec = _mm_or_si128(vec_0, vec_1); + __m512i vec_i32 = _mm512_cvtepu8_epi32(vec); + reg = _mm512_permutexvar_ps(vec_i32, lut.reg); + } + explicit FP32Vec16(const FP32Vec4& data) : reg((__m512)_mm512_inserti32x4( _mm512_inserti32x4( @@ -426,14 +446,6 @@ struct FP32Vec16 : public Vec { float get_last_elem() const { return _mm512_cvtss_f32(reg); } - template - float reduce_sub_sum(int idx) { - static_assert(VEC_ELEM_NUM % group_size == 0); - constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); - __mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size)); - return _mm512_mask_reduce_add_ps(mask, reg); - } - void save(float* ptr) const { _mm512_storeu_ps(ptr, reg); } void save(float* ptr, const int elem_num) const { @@ -755,6 +767,25 @@ inline void non_temporal_save(BF16Vec16& vec, void* ptr) { inline void non_temporal_save(FP32Vec16& vec, void* ptr) { _mm512_stream_ps((float*)ptr, vec.reg); } + +static void interleave_save(const BF16Vec16& vec0, const BF16Vec16& vec1, + void* ptr) { + __m512i vec_0 = _mm512_cvtepu16_epi32(vec0.reg); + __m512i vec_1 = _mm512_cvtepu16_epi32(vec1.reg); + vec_1 = _mm512_slli_epi32(vec_1, 16); + vec_0 = _mm512_or_si512(vec_0, vec_1); + _mm512_storeu_epi32(ptr, vec_0); +} + +static void interleave_save(const FP16Vec16& vec0, const FP16Vec16& vec1, + void* ptr) { + __m512i vec_0 = _mm512_cvtepu16_epi32(vec0.reg); + __m512i vec_1 = _mm512_cvtepu16_epi32(vec1.reg); + vec_1 = _mm512_slli_epi32(vec_1, 16); + vec_0 = _mm512_or_si512(vec_0, vec_1); + _mm512_storeu_epi32(ptr, vec_0); +} + #endif inline void mem_barrier() { _mm_mfence(); } diff --git a/csrc/cpu/cpu_wna16.cpp b/csrc/cpu/cpu_wna16.cpp new file mode 100644 index 000000000000..816d195506e5 --- /dev/null +++ b/csrc/cpu/cpu_wna16.cpp @@ -0,0 +1,402 @@ +#include "cpu_types.hpp" +#include "scratchpad_manager.h" +#include "utils.hpp" + +#ifdef CPU_CAPABILITY_AMXBF16 + #include "cpu/micro_gemm/cpu_micro_gemm_amx.hpp" +#endif +#include "cpu/micro_gemm/cpu_micro_gemm_vec.hpp" + +#define VLLM_DISPATCH_CASE_16B_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) + +#define VLLM_DISPATCH_16B_TYPES(TYPE, NAME, ...) 
\ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_16B_TYPES(__VA_ARGS__)) + +template +void print_logits(const char* name, T* ptr, int32_t row, int32_t col, + int32_t stride) { + std::stringstream ss; + ss << std::fixed << std::setprecision(5) << name << ": [\n"; + auto* curr_logits_buffer = ptr; + for (int32_t m = 0; m < row; ++m) { + for (int32_t n = 0; n < col; ++n) { + ss << curr_logits_buffer[n] << ", "; + } + ss << "\n"; + curr_logits_buffer += stride; + } + ss << "]\n"; + std::printf("%s", ss.str().c_str()); +} + +namespace { +using cpu_utils::ISA; +using cpu_utils::VecTypeTrait; + +template +class Dequantizer4b { + public: + constexpr static int32_t pack_num = 32 / 4; + using scalar_vec_t = typename VecTypeTrait::vec_t; + + public: + static void dequant(int32_t* __restrict__ q_weight, + scalar_t* __restrict__ weight, + scalar_t* __restrict__ scales, + int32_t* __restrict__ zeros, int32_t* __restrict__ g_idx, + const int64_t scales_stride, const int64_t zeros_stride, + const int32_t k_size, const int32_t group_size) { + vec_op::FP32Vec16 lut; + if constexpr (has_zp) { + // AWQ + alignas(64) static const float LUT[16] = { + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, + 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f}; + lut = vec_op::FP32Vec16(LUT); + } else { + // GPTQ + alignas(64) static const float LUT[16] = { + -8.0f, -7.0f, -6.0f, -5.0f, -4.0f, -3.0f, -2.0f, -1.0f, + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}; + lut = vec_op::FP32Vec16(LUT); + } + + // per 64-bits elem contains 16 output channels + int64_t* __restrict__ curr_q_weight = reinterpret_cast(q_weight); + int64_t* __restrict__ curr_zeros = reinterpret_cast(zeros); + scalar_t* __restrict__ curr_weight = weight; + scalar_t* __restrict__ curr_scale = scales; + vec_op::FP32Vec16 scale_0; + vec_op::FP32Vec16 scale_1; + vec_op::FP32Vec16 zero_0; + vec_op::FP32Vec16 zero_1; + int32_t group_counter = 0; + for (int32_t k_idx = 0; k_idx < k_size; k_idx += 2) { + int64_t qwb_0 = *curr_q_weight; + int64_t qwb_1 = *(curr_q_weight + 1); + vec_op::FP32Vec16 wb_0(qwb_0, lut); + vec_op::FP32Vec16 wb_1(qwb_1, lut); + + if constexpr (!use_desc_act) { + if (group_counter == 0) { + scale_0 = vec_op::FP32Vec16(scalar_vec_t(curr_scale)); + scale_1 = vec_op::FP32Vec16(scale_0); + curr_scale += scales_stride; + + if constexpr (has_zp) { + zero_0 = vec_op::FP32Vec16(*curr_zeros, lut); + zero_1 = vec_op::FP32Vec16(zero_0); + curr_zeros += zeros_stride / 2; + } + } + } else { + int32_t g_idx_0 = g_idx[k_idx]; + int32_t g_idx_1 = g_idx[k_idx + 1]; + scale_0 = vec_op::FP32Vec16( + scalar_vec_t(curr_scale + g_idx_0 * scales_stride)); + scale_1 = vec_op::FP32Vec16( + scalar_vec_t(curr_scale + g_idx_1 * scales_stride)); + if constexpr (has_zp) { + zero_0 = vec_op::FP32Vec16(*(curr_zeros + g_idx_0 * zeros_stride / 2), + lut); + zero_1 = vec_op::FP32Vec16(*(curr_zeros + g_idx_1 * zeros_stride / 2), + lut); + } + } + + if constexpr (has_zp) { + wb_0 = wb_0 - zero_0; + wb_1 = wb_1 - zero_1; + } + + wb_0 = wb_0 * scale_0; + wb_1 = wb_1 * scale_1; + + scalar_vec_t output_vec_0(wb_0); + scalar_vec_t output_vec_1(wb_1); + + // AMX needs to interlave K elements to pack as 32 bits + if constexpr (isa == ISA::AMX) { + vec_op::interleave_save(output_vec_0, output_vec_1, curr_weight); + } else { + output_vec_0.save(curr_weight); + output_vec_1.save(curr_weight + 16); + } + + // update + curr_q_weight += 2; + curr_weight += 32; + if constexpr (!use_desc_act) { + group_counter += 2; + if (group_counter == group_size) { + group_counter = 0; + } + } 
+ } + } +}; +}; // namespace + +template +void cpu_gemm_wna16_impl( + scalar_t* __restrict__ input, int32_t* __restrict__ q_weight, + scalar_t* __restrict__ output, scalar_t* __restrict__ scales, + int32_t* __restrict__ zeros, int32_t* __restrict__ g_idx, + scalar_t* __restrict__ bias, const int32_t m_size, const int32_t n_size, + const int32_t k_size, const int64_t input_stride, + const int64_t output_stride, const int64_t scales_group_stride, + const int64_t zeros_group_stride, const int32_t group_num, + const int32_t group_size, const int64_t pack_factor) { + constexpr int32_t gemm_n_tile_size = gemm_t::NSize; + constexpr int32_t gemm_m_tile_size = gemm_t::MaxMSize; + constexpr int32_t n_block_size = 16; + static_assert(gemm_n_tile_size % n_block_size == 0); + const int32_t thread_num = omp_get_max_threads(); + + // a simple schedule policy, just to hold more B tiles in L2 and make sure + // each thread has tasks + const int32_t n_partition_size = [&]() { + const int64_t cache_size = cpu_utils::get_l2_size(); + int64_t ps_cache_limit = cache_size / (k_size * sizeof(scalar_t)); + int64_t ps_thread_limit = n_size / thread_num; + ps_cache_limit = + std::max((ps_cache_limit / gemm_n_tile_size) * gemm_n_tile_size, + (int64_t)gemm_n_tile_size); + ps_thread_limit = + std::max((ps_thread_limit / gemm_n_tile_size) * gemm_n_tile_size, + (int64_t)gemm_n_tile_size); + return std::min(ps_cache_limit, ps_thread_limit); + }(); + const int32_t task_num = (n_size + n_partition_size - 1) / n_partition_size; + + // get buffer size + const int64_t b_buffer_size = + (((n_partition_size * k_size * sizeof(scalar_t) + 63) / 64) * 64); + const int64_t c_buffer_size = + (((gemm_m_tile_size * gemm_n_tile_size * sizeof(float) + 63) / 64) * 64); + const int64_t b_buffer_offset = 0; + const int64_t c_buffer_offset = b_buffer_size; + const int64_t buffer_size = b_buffer_size + c_buffer_size; + DNNLScratchPadManager::get_dnnl_scratchpad_manager()->realloc(buffer_size * + thread_num); + + alignas(64) cpu_utils::Counter counter; + cpu_utils::Counter* counter_ptr = &counter; + +#pragma omp parallel for schedule(static, 1) + for (int32_t thread_id = 0; thread_id < thread_num; ++thread_id) { + scalar_t* __restrict__ b_buffer = nullptr; + float* __restrict__ c_buffer = nullptr; + { + uint8_t* buffer_ptr = DNNLScratchPadManager::get_dnnl_scratchpad_manager() + ->get_data() + + thread_id * buffer_size; + b_buffer = reinterpret_cast(buffer_ptr + b_buffer_offset); + c_buffer = reinterpret_cast(buffer_ptr + c_buffer_offset); + } + + const int64_t q_weight_block_stride = n_block_size / pack_factor * k_size; + const int64_t b_buffer_block_stride = n_block_size * k_size; + const int32_t zeros_block_stride = n_block_size / pack_factor; + + gemm_t gemm; + + for (;;) { + int32_t task_id = counter_ptr->acquire_counter(); + + if (task_id >= task_num) { + break; + } + + const int32_t n_start_idx = task_id * n_partition_size; + const int32_t n_block_start_idx = n_start_idx / n_block_size; + const int32_t n_num = std::min(n_partition_size, n_size - n_start_idx); + const int32_t n_block_num = n_num / n_block_size; + // std::printf("thread_id: %d, task_id: %d, n_start_idx: %d, n_num: %d\n", + // thread_id, task_id, n_start_idx, n_num); + + // dequant weight + { + int32_t* __restrict__ curr_q_weight = + q_weight + n_block_start_idx * q_weight_block_stride; + scalar_t* __restrict__ curr_b_buffer = b_buffer; + scalar_t* __restrict__ curr_scales = scales + n_start_idx; + int32_t* __restrict__ curr_zeros = zeros + n_start_idx / pack_factor; + for 
(int32_t block_idx = 0; block_idx < n_block_num; ++block_idx) { + dequantizer_t::dequant(curr_q_weight, curr_b_buffer, curr_scales, + curr_zeros, g_idx, scales_group_stride, + zeros_group_stride, k_size, group_size); + + // if (block_idx == 0 && n_start_idx == 0) { + // print_logits("depacked weight", curr_b_buffer, k_size, + // n_block_size, n_block_size); + // } + + // update + curr_q_weight += q_weight_block_stride; + curr_b_buffer += b_buffer_block_stride; + curr_scales += n_block_size; + curr_zeros += zeros_block_stride; + } + } + + // compute loop + { + const int32_t n_tile_num = n_num / gemm_n_tile_size; + scalar_t* __restrict__ curr_input = input; + scalar_t* __restrict__ init_bias = bias; + if (bias != nullptr) { + init_bias += n_start_idx; + } + scalar_t* __restrict__ init_output = output + n_start_idx; + for (int32_t m_idx = 0; m_idx < m_size; m_idx += gemm_m_tile_size) { + const int32_t curr_m_size = + std::min(gemm_m_tile_size, m_size - m_idx); + scalar_t* __restrict__ curr_b_buffer = b_buffer; + scalar_t* __restrict__ curr_bias = init_bias; + scalar_t* __restrict__ curr_output = init_output; + for (int32_t n_tile_idx = 0; n_tile_idx < n_tile_num; ++n_tile_idx) { + gemm.gemm(curr_input, curr_b_buffer, c_buffer, curr_m_size, k_size, + input_stride, b_buffer_block_stride, gemm_n_tile_size, + false); + + if (bias != nullptr) { + cpu_micro_gemm::bias_epilogue( + c_buffer, curr_output, curr_bias, curr_m_size, + gemm_n_tile_size, output_stride); + curr_bias += gemm_n_tile_size; + } else { + cpu_micro_gemm::default_epilogue( + c_buffer, curr_output, curr_m_size, gemm_n_tile_size, + output_stride); + } + + curr_b_buffer += + b_buffer_block_stride * (gemm_n_tile_size / n_block_size); + curr_output += gemm_n_tile_size; + } + curr_input += gemm_m_tile_size * input_stride; + init_output += gemm_m_tile_size * output_stride; + } + } + } + } +} + +void cpu_gemm_wna16( + const torch::Tensor& input, // [M, K] + const torch::Tensor& + q_weight, // [N / 16, K * 16 / pack_factor], packed as int32 + torch::Tensor& output, // [M, N] + const torch::Tensor& scales, // [group_num, N] + const std::optional& + zeros, // [group_num, N / pack_factor], packed as int32 + const std::optional& g_idx, // [K] + const std::optional& bias, // [N] + const int64_t pack_factor, const std::string& isa_hint) { + using cpu_utils::ISA; + TORCH_CHECK_EQ(pack_factor, 8); // only supports 4bits + const int32_t a_m_size = input.size(0); + const int32_t a_k_size = input.size(1); + const int64_t a_m_stride = input.stride(0); + const int32_t b_n_size = q_weight.size(0) * 16; + TORCH_CHECK_EQ(a_k_size % 32, 0); + TORCH_CHECK_EQ(b_n_size % 32, 0); + const int32_t group_num = scales.size(0); + const int32_t group_size = a_k_size / group_num; + TORCH_CHECK_EQ(group_size % 2, 0); + const int64_t scales_group_stride = scales.stride(0); + const int64_t output_m_stride = output.stride(0); + + bool has_zp = zeros.has_value(); + bool use_desc_act = g_idx.has_value(); + TORCH_CHECK(!(has_zp && use_desc_act)); + + ISA isa = [&]() { + if (isa_hint == "amx") { + return ISA::AMX; + } else if (isa_hint == "vec") { + return ISA::VEC; + } else { + TORCH_CHECK(false, "unsupported isa hint: " + isa_hint); + } + }(); + + int32_t* zeros_ptr = has_zp ? zeros->data_ptr() : nullptr; + const int64_t zeros_group_stride = has_zp ? zeros->stride(0) : 0; + int32_t* g_idx_ptr = use_desc_act ? 
g_idx->data_ptr() : nullptr; + + VLLM_DISPATCH_16B_TYPES(input.scalar_type(), "cpu_gemm_wna16", [&]() { + if (isa == ISA::AMX) { + using gemm_t = cpu_micro_gemm::MicroGemm; + if (has_zp) { + using dequantizer_t = Dequantizer4b; + cpu_gemm_wna16_impl( + input.data_ptr(), q_weight.data_ptr(), + output.data_ptr(), scales.data_ptr(), zeros_ptr, + g_idx_ptr, bias.has_value() ? bias->data_ptr() : nullptr, + a_m_size, b_n_size, a_k_size, a_m_stride, output_m_stride, + scales_group_stride, zeros_group_stride, group_num, group_size, + pack_factor); + return; + } + if (use_desc_act) { + using dequantizer_t = Dequantizer4b; + cpu_gemm_wna16_impl( + input.data_ptr(), q_weight.data_ptr(), + output.data_ptr(), scales.data_ptr(), zeros_ptr, + g_idx_ptr, bias.has_value() ? bias->data_ptr() : nullptr, + a_m_size, b_n_size, a_k_size, a_m_stride, output_m_stride, + scales_group_stride, zeros_group_stride, group_num, group_size, + pack_factor); + return; + } else { + using dequantizer_t = Dequantizer4b; + cpu_gemm_wna16_impl( + input.data_ptr(), q_weight.data_ptr(), + output.data_ptr(), scales.data_ptr(), zeros_ptr, + g_idx_ptr, bias.has_value() ? bias->data_ptr() : nullptr, + a_m_size, b_n_size, a_k_size, a_m_stride, output_m_stride, + scales_group_stride, zeros_group_stride, group_num, group_size, + pack_factor); + return; + } + } else if (isa == ISA::VEC) { + using gemm_t = cpu_micro_gemm::MicroGemm; + if (has_zp) { + using dequantizer_t = Dequantizer4b; + cpu_gemm_wna16_impl( + input.data_ptr(), q_weight.data_ptr(), + output.data_ptr(), scales.data_ptr(), zeros_ptr, + g_idx_ptr, bias.has_value() ? bias->data_ptr() : nullptr, + a_m_size, b_n_size, a_k_size, a_m_stride, output_m_stride, + scales_group_stride, zeros_group_stride, group_num, group_size, + pack_factor); + return; + } + if (use_desc_act) { + using dequantizer_t = Dequantizer4b; + cpu_gemm_wna16_impl( + input.data_ptr(), q_weight.data_ptr(), + output.data_ptr(), scales.data_ptr(), zeros_ptr, + g_idx_ptr, bias.has_value() ? bias->data_ptr() : nullptr, + a_m_size, b_n_size, a_k_size, a_m_stride, output_m_stride, + scales_group_stride, zeros_group_stride, group_num, group_size, + pack_factor); + return; + } else { + using dequantizer_t = Dequantizer4b; + cpu_gemm_wna16_impl( + input.data_ptr(), q_weight.data_ptr(), + output.data_ptr(), scales.data_ptr(), zeros_ptr, + g_idx_ptr, bias.has_value() ? 
bias->data_ptr() : nullptr, + a_m_size, b_n_size, a_k_size, a_m_stride, output_m_stride, + scales_group_stride, zeros_group_stride, group_num, group_size, + pack_factor); + return; + } + } + }); +} diff --git a/csrc/cpu/dnnl_helper.cpp b/csrc/cpu/dnnl_helper.cpp index 02a8072ccf30..cfb6e78cba9a 100644 --- a/csrc/cpu/dnnl_helper.cpp +++ b/csrc/cpu/dnnl_helper.cpp @@ -396,9 +396,9 @@ MatMulPrimitiveHandler::MatMulPrimitiveHandler(const Args& args) : DNNLMatMulPrimitiveHandler( static_cast(args), args.ab_type), m_size_cache_(nullptr) { - assert(ab_type_ == dnnl::memory::data_type::f32 || - ab_type_ == dnnl::memory::data_type::bf16 || - ab_type_ == dnnl::memory::data_type::f16); + assert(b_type_ == dnnl::memory::data_type::f32 || + b_type_ == dnnl::memory::data_type::bf16 || + b_type_ == dnnl::memory::data_type::f16); dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_, {b_k_stride_, b_n_stride_}); diff --git a/csrc/cpu/micro_gemm/cpu_micro_gemm_amx.hpp b/csrc/cpu/micro_gemm/cpu_micro_gemm_amx.hpp new file mode 100644 index 000000000000..87a019773a89 --- /dev/null +++ b/csrc/cpu/micro_gemm/cpu_micro_gemm_amx.hpp @@ -0,0 +1,245 @@ +#ifndef CPU_MICRO_GEMM_AMX_HPP +#define CPU_MICRO_GEMM_AMX_HPP +#include "cpu/micro_gemm/cpu_micro_gemm_impl.hpp" + +namespace cpu_micro_gemm { +namespace { +// AMX specific +constexpr static int64_t AMX_TILE_ROW_BYTES = 64; +constexpr static int64_t AMX_TILE_ROW_NUM = 16; +constexpr static int64_t AMX_TILE_BYTES = AMX_TILE_ROW_BYTES * AMX_TILE_ROW_NUM; + +typedef struct __tile_config { + uint8_t palette_id = 1; + uint8_t start_row = 0; + uint8_t reserved_0[14] = {0}; + uint16_t colsb[16] = {0}; + uint8_t rows[16] = {0}; +} __tilecfg; + +// 2-2-4 pattern, for 16 < m <= 32 +// TILE 0, 1: load A matrix, row num should be 16, m - 16 +// TILE 2, 3: load B matrix, row num should be 16 +// TILE 4, 5, 6, 7: store results C matrix, row num should be 16, 16, m - 16, m +// - 16 +template +class TileGemm224 { + public: + FORCE_INLINE static void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) { + TORCH_CHECK(false, "Unsupported data type for TileGemm224"); + } + + FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) { + TORCH_CHECK(false, "Unsupported data type for TileGemm224"); + } +}; + +template <> +class TileGemm224 { + public: + using scalar_t = c10::BFloat16; + FORCE_INLINE static void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) { + const int32_t k_times = k / (AMX_TILE_ROW_NUM * 4 / sizeof(c10::BFloat16)); + c10::BFloat16* __restrict__ a_tile_0 = a_ptr; + c10::BFloat16* __restrict__ a_tile_1 = a_ptr + lda * AMX_TILE_ROW_NUM; + const int64_t a_tile_stride = lda * sizeof(c10::BFloat16); + + // B is always packed as 16 output channels block + c10::BFloat16* __restrict__ b_tile_2 = b_ptr; + c10::BFloat16* __restrict__ b_tile_3 = b_ptr + b_n_group_stride; + const int32_t b_tile_stride = AMX_TILE_ROW_BYTES; + + float* __restrict__ c_tile_4 = c_ptr; + float* __restrict__ c_tile_5 = + c_tile_4 + AMX_TILE_ROW_BYTES / sizeof(float); + float* __restrict__ c_tile_6 = c_ptr + AMX_TILE_ROW_NUM * ldc; + float* __restrict__ c_tile_7 = + c_tile_6 + AMX_TILE_ROW_BYTES / sizeof(float); + const int32_t c_tile_stride = ldc * sizeof(float); + + if (accum_c) { + _tile_loadd(4, c_tile_4, c_tile_stride); + _tile_loadd(5, c_tile_5, c_tile_stride); + _tile_loadd(6, c_tile_6, c_tile_stride); + _tile_loadd(7, c_tile_7, c_tile_stride); + } else { + _tile_zero(4); + _tile_zero(5); + _tile_zero(6); + _tile_zero(7); + } + + for (int32_t k = 0; k < k_times; ++k) { + _tile_loadd(0, a_tile_0, 
a_tile_stride); + _tile_stream_loadd(2, b_tile_2, b_tile_stride); + _tile_dpbf16ps(4, 0, 2); + _tile_stream_loadd(3, b_tile_3, b_tile_stride); + _tile_dpbf16ps(5, 0, 3); + _tile_loadd(1, a_tile_1, a_tile_stride); + _tile_dpbf16ps(6, 1, 2); + _tile_dpbf16ps(7, 1, 3); + + // update ptrs + a_tile_0 += AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16); + a_tile_1 += AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16); + b_tile_2 += AMX_TILE_BYTES / sizeof(c10::BFloat16); + b_tile_3 += AMX_TILE_BYTES / sizeof(c10::BFloat16); + } + + _tile_stored(4, c_tile_4, c_tile_stride); + _tile_stored(5, c_tile_5, c_tile_stride); + _tile_stored(6, c_tile_6, c_tile_stride); + _tile_stored(7, c_tile_7, c_tile_stride); + } + + FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) { + const int32_t m_0 = AMX_TILE_ROW_NUM; + const int32_t m_1 = m - AMX_TILE_ROW_NUM; + config.rows[0] = m_0; + config.rows[1] = m_1; + config.rows[2] = AMX_TILE_ROW_NUM; + config.rows[3] = AMX_TILE_ROW_NUM; + config.rows[4] = m_0; + config.rows[5] = m_0; + config.rows[6] = m_1; + config.rows[7] = m_1; + _tile_loadconfig(&config); + } +}; + +// 1-2-2 pattern, for 0 < m <= 16 +// TILE 0, (1): load A matrix, use extra 1 tile for prefetch, row num should be +// m, m +// TILE 2, 3, (4, 5): load B matrix, use extra 2 tiles for prefetch, row +// num should be 16 +// TILE 6, 7, (6, 7): store results C matrix, row num should be +// m +template +class TileGemm122 { + public: + FORCE_INLINE static void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) { + TORCH_CHECK(false, "Unsupported data type for TileGemm122"); + } + + FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) { + TORCH_CHECK(false, "Unsupported data type for TileGemm122"); + } +}; + +template <> +class TileGemm122 { + public: + using scalar_t = c10::BFloat16; + FORCE_INLINE static void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) { + c10::BFloat16* __restrict__ a_tile_0 = a_ptr; + c10::BFloat16* __restrict__ a_tile_1 = + a_ptr + AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16); + const int64_t a_tile_stride = lda * sizeof(c10::BFloat16); + + c10::BFloat16* __restrict__ b_tile_2 = b_ptr; + c10::BFloat16* __restrict__ b_tile_3 = b_ptr + b_n_group_stride; + c10::BFloat16* __restrict__ b_tile_4 = + b_tile_2 + AMX_TILE_BYTES / sizeof(c10::BFloat16); + c10::BFloat16* __restrict__ b_tile_5 = + b_tile_3 + AMX_TILE_BYTES / sizeof(c10::BFloat16); + int64_t b_stride = AMX_TILE_ROW_BYTES; + + float* __restrict__ c_tile_6 = c_ptr; + float* __restrict__ c_tile_7 = c_ptr + AMX_TILE_ROW_BYTES / sizeof(float); + int64_t c_stride = ldc * sizeof(float); + + const int32_t k_times = k / (AMX_TILE_ROW_NUM * 4 / sizeof(c10::BFloat16)); + const int32_t k_group_times = k_times / 2; + const bool has_tail = (k_times % 2 == 1); + + if (accum_c) { + _tile_loadd(6, c_tile_6, c_stride); + _tile_loadd(7, c_tile_7, c_stride); + } else { + _tile_zero(6); + _tile_zero(7); + } + + for (int32_t k = 0; k < k_group_times; ++k) { + _tile_loadd(0, a_tile_0, a_tile_stride); + _tile_stream_loadd(2, b_tile_2, b_stride); + _tile_dpbf16ps(6, 0, 2); + _tile_stream_loadd(3, b_tile_3, b_stride); + _tile_dpbf16ps(7, 0, 3); + _tile_loadd(1, a_tile_1, a_tile_stride); + _tile_stream_loadd(4, b_tile_4, b_stride); + _tile_dpbf16ps(6, 1, 4); + _tile_stream_loadd(5, b_tile_5, b_stride); + _tile_dpbf16ps(7, 1, 5); + + // update ptrs + a_tile_0 += 2 * AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16); + a_tile_1 += 2 * AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16); + b_tile_2 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16); + b_tile_3 += 2 * AMX_TILE_BYTES / 
sizeof(c10::BFloat16); + b_tile_4 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16); + b_tile_5 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16); + } + + if (has_tail) { + _tile_loadd(0, a_tile_0, a_tile_stride); + _tile_stream_loadd(2, b_tile_2, b_stride); + _tile_dpbf16ps(6, 0, 2); + _tile_stream_loadd(3, b_tile_3, b_stride); + _tile_dpbf16ps(7, 0, 3); + } + + _tile_stored(6, c_tile_6, c_stride); + _tile_stored(7, c_tile_7, c_stride); + } + + FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) { + config.rows[0] = m; + config.rows[1] = m; + config.rows[2] = AMX_TILE_ROW_NUM; + config.rows[3] = AMX_TILE_ROW_NUM; + config.rows[4] = AMX_TILE_ROW_NUM; + config.rows[5] = AMX_TILE_ROW_NUM; + config.rows[6] = m; + config.rows[7] = m; + _tile_loadconfig(&config); + } +}; +} // namespace + +// Gemm kernel uses AMX, requires B matrix to be packed +template +class MicroGemm { + public: + static constexpr int32_t MaxMSize = 32; + static constexpr int32_t NSize = 32; + + public: + MicroGemm() : curr_m_(-1) { + vec_op::unroll_loop([&](int i) { amx_tile_config_.colsb[i] = 64; }); + } + + void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) { + if (m > AMX_TILE_ROW_NUM) { + if (m != curr_m_) { + curr_m_ = m; + TileGemm224::init_tile_config(m, amx_tile_config_); + } + TileGemm224::gemm(CPU_MICRO_GEMM_PARAMS); + } else { + if (m != curr_m_) { + curr_m_ = m; + TileGemm122::init_tile_config(m, amx_tile_config_); + } + TileGemm122::gemm(CPU_MICRO_GEMM_PARAMS); + } + } + + private: + alignas(64) __tilecfg amx_tile_config_; + int32_t curr_m_; +}; + +} // namespace cpu_micro_gemm + +#endif diff --git a/csrc/cpu/micro_gemm/cpu_micro_gemm_impl.hpp b/csrc/cpu/micro_gemm/cpu_micro_gemm_impl.hpp new file mode 100644 index 000000000000..784da55a420e --- /dev/null +++ b/csrc/cpu/micro_gemm/cpu_micro_gemm_impl.hpp @@ -0,0 +1,91 @@ +#ifndef CPU_MICRO_GEMM_IMPL_HPP +#define CPU_MICRO_GEMM_IMPL_HPP +#include "cpu/utils.hpp" +#include "cpu/cpu_types.hpp" + +namespace cpu_micro_gemm { +#define DEFINE_CPU_MICRO_GEMM_PARAMS \ + scalar_t *__restrict__ a_ptr, scalar_t *__restrict__ b_ptr, \ + float *__restrict__ c_ptr, const int32_t m, const int32_t k, \ + const int64_t lda, const int64_t b_n_group_stride, const int64_t ldc, \ + const bool accum_c + +#define CPU_MICRO_GEMM_PARAMS \ + a_ptr, b_ptr, c_ptr, m, k, lda, b_n_group_stride, ldc, accum_c + +template +class MicroGemm { + public: + static constexpr int32_t MaxMSize = 16; + static constexpr int32_t NSize = 16; + + public: + void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) { + TORCH_CHECK(false, "Unimplemented MicroGemm."); + } +}; + +template +FORCE_INLINE void default_epilogue(float* __restrict__ c_ptr, + scalar_t* __restrict__ d_ptr, + const int32_t m, const int64_t ldc, + const int64_t ldd) { + using scalar_vec_t = typename cpu_utils::VecTypeTrait::vec_t; + static_assert(n_size % 16 == 0); + + float* __restrict__ curr_c = c_ptr; + scalar_t* __restrict__ curr_d = d_ptr; + for (int32_t i = 0; i < m; ++i) { + float* __restrict__ curr_c_iter = curr_c; + scalar_t* __restrict__ curr_d_iter = curr_d; + vec_op::unroll_loop([&](int32_t n_g_idx) { + vec_op::FP32Vec16 c_vec_fp32(curr_c_iter); + scalar_vec_t c_vec(c_vec_fp32); + c_vec.save(curr_d_iter); + curr_c_iter += 16; + curr_d_iter += 16; + }); + curr_c += ldc; + curr_d += ldd; + } +} + +template +FORCE_INLINE void bias_epilogue(float* __restrict__ c_ptr, + scalar_t* __restrict__ d_ptr, + scalar_t* __restrict__ bias_ptr, + const int32_t m, const int64_t ldc, + const int64_t ldd) { + using scalar_vec_t = typename 
cpu_utils::VecTypeTrait::vec_t; + static_assert(n_size % 16 == 0); + constexpr int32_t n_group_num = n_size / 16; + static_assert(n_group_num <= 16); + + vec_op::FP32Vec16 bias_vecs[n_group_num]; + scalar_t* __restrict__ curr_bias = bias_ptr; + vec_op::unroll_loop([&](int32_t i) { + scalar_vec_t vec(curr_bias); + bias_vecs[i] = vec_op::FP32Vec16(vec); + curr_bias += 16; + }); + + float* __restrict__ curr_c = c_ptr; + scalar_t* __restrict__ curr_d = d_ptr; + for (int32_t i = 0; i < m; ++i) { + float* __restrict__ curr_c_iter = curr_c; + scalar_t* __restrict__ curr_d_iter = curr_d; + vec_op::unroll_loop([&](int32_t n_g_idx) { + vec_op::FP32Vec16 c_vec_fp32(curr_c_iter); + c_vec_fp32 = c_vec_fp32 + bias_vecs[n_g_idx]; + scalar_vec_t c_vec(c_vec_fp32); + c_vec.save(curr_d_iter); + curr_c_iter += 16; + curr_d_iter += 16; + }); + curr_c += ldc; + curr_d += ldd; + } +} +} // namespace cpu_micro_gemm + +#endif diff --git a/csrc/cpu/micro_gemm/cpu_micro_gemm_vec.hpp b/csrc/cpu/micro_gemm/cpu_micro_gemm_vec.hpp new file mode 100644 index 000000000000..3985c2f2e5fe --- /dev/null +++ b/csrc/cpu/micro_gemm/cpu_micro_gemm_vec.hpp @@ -0,0 +1,115 @@ +#ifndef CPU_MICRO_GEMM_VEC_HPP +#define CPU_MICRO_GEMM_VEC_HPP +#include "cpu/micro_gemm/cpu_micro_gemm_impl.hpp" + +namespace cpu_micro_gemm { +namespace { +// 8-2-16 pattern, 8 regs for A, 2 regs for B, 16 regs for C, [8, K] @ [k, 32] +template +class TileGemm82 { + public: + FORCE_INLINE static void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) { + switch (m) { + case 1: + gemm_micro<1>(CPU_MICRO_GEMM_PARAMS); + break; + case 2: + gemm_micro<2>(CPU_MICRO_GEMM_PARAMS); + break; + case 3: + gemm_micro<3>(CPU_MICRO_GEMM_PARAMS); + break; + case 4: + gemm_micro<4>(CPU_MICRO_GEMM_PARAMS); + break; + case 5: + gemm_micro<5>(CPU_MICRO_GEMM_PARAMS); + break; + case 6: + gemm_micro<6>(CPU_MICRO_GEMM_PARAMS); + break; + case 7: + gemm_micro<7>(CPU_MICRO_GEMM_PARAMS); + break; + case 8: + gemm_micro<8>(CPU_MICRO_GEMM_PARAMS); + break; + } + } + + template + static void gemm_micro(DEFINE_CPU_MICRO_GEMM_PARAMS) { + static_assert(0 < M <= 8); + using load_vec_t = typename cpu_utils::VecTypeTrait::vec_t; + + scalar_t* __restrict__ curr_b_0 = b_ptr; + scalar_t* __restrict__ curr_b_1 = b_ptr + b_n_group_stride; + float* __restrict__ curr_c_0 = c_ptr; + float* __restrict__ curr_c_1 = c_ptr + 16; + + vec_op::FP32Vec16 c_regs[M * 2]; + if (accum_c) { + float* __restrict__ curr_m_c_0 = curr_c_0; + float* __restrict__ curr_m_c_1 = curr_c_1; + vec_op::unroll_loop([&](int32_t i) { + c_regs[i * 2] = vec_op::FP32Vec16(curr_m_c_0); + c_regs[i * 2 + 1] = vec_op::FP32Vec16(curr_m_c_1); + + // update + curr_m_c_0 += ldc; + curr_m_c_1 += ldc; + }); + } + + scalar_t* __restrict__ curr_a = a_ptr; + for (int32_t k_idx = 0; k_idx < k; ++k_idx) { + load_vec_t b_0_reg(curr_b_0); + vec_op::FP32Vec16 fp32_b_0_reg(b_0_reg); + load_vec_t b_1_reg(curr_b_1); + vec_op::FP32Vec16 fp32_b_1_reg(b_1_reg); + + scalar_t* __restrict__ curr_m_a = curr_a; + vec_op::unroll_loop([&](int32_t i) { + scalar_t v = *curr_m_a; + load_vec_t a_reg_original(v); + vec_op::FP32Vec16 a_reg(a_reg_original); + c_regs[i * 2] = c_regs[i * 2] + a_reg * fp32_b_0_reg; + c_regs[i * 2 + 1] = c_regs[i * 2 + 1] + a_reg * fp32_b_1_reg; + + // update + curr_m_a += lda; + }); + + // update + curr_a += 1; + curr_b_0 += 16; + curr_b_1 += 16; + } + + vec_op::unroll_loop([&](int32_t i) { + c_regs[i * 2].save(curr_c_0); + c_regs[i * 2 + 1].save(curr_c_1); + + // update + curr_c_0 += ldc; + curr_c_1 += ldc; + }); + } +}; +} // namespace + +// Gemm 
kernel uses vector instructions, requires B matrix to be packed +template +class MicroGemm { + public: + static constexpr int32_t MaxMSize = 8; + static constexpr int32_t NSize = 32; + + public: + void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) { + TileGemm82::gemm(CPU_MICRO_GEMM_PARAMS); + } +}; +} // namespace cpu_micro_gemm + +#endif diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index 9fefd88cd9b0..b07d20bab7dd 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -103,6 +103,13 @@ void cpu_attention_with_kv_cache( // Note: just for avoiding importing errors void placeholder_op() { TORCH_CHECK(false, "Unimplemented"); } +void cpu_gemm_wna16(const torch::Tensor& input, const torch::Tensor& q_weight, + torch::Tensor& output, const torch::Tensor& scales, + const std::optional& zeros, + const std::optional& g_idx, + const std::optional& bias, + const int64_t pack_factor, const std::string& isa_hint); + TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // vLLM custom ops @@ -283,6 +290,15 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("static_scaled_fp8_quant() -> ()", placeholder_op); ops.def("dynamic_scaled_fp8_quant() -> ()", placeholder_op); ops.def("dynamic_per_token_scaled_fp8_quant() -> ()", placeholder_op); + + // WNA16 +#if defined(__AVX512F__) + ops.def( + "cpu_gemm_wna16(Tensor input, Tensor q_weight, Tensor(a2!) output, " + "Tensor scales, Tensor? zeros, Tensor? g_idx, Tensor? bias, SymInt " + "pack_factor, str isa_hint) -> ()"); + ops.impl("cpu_gemm_wna16", torch::kCPU, &cpu_gemm_wna16); +#endif } TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) { diff --git a/csrc/cpu/utils.hpp b/csrc/cpu/utils.hpp new file mode 100644 index 000000000000..d8399c56f6af --- /dev/null +++ b/csrc/cpu/utils.hpp @@ -0,0 +1,55 @@ +#ifndef UTILS_HPP +#define UTILS_HPP + +#include +#include +#include +#include + +#include "cpu_types.hpp" + +namespace cpu_utils { +enum class ISA { AMX, VEC }; + +template +struct VecTypeTrait { + using vec_t = void; +}; + +template <> +struct VecTypeTrait { + using vec_t = vec_op::FP32Vec16; +}; + +template <> +struct VecTypeTrait { + using vec_t = vec_op::BF16Vec16; +}; + +template <> +struct VecTypeTrait { + using vec_t = vec_op::FP16Vec16; +}; + +struct Counter { + std::atomic counter; + char _padding[56]; + + Counter() : counter(0) {} + + void reset_counter() { counter.store(0); } + + int64_t acquire_counter() { return counter++; } +}; + +inline int64_t get_l2_size() { + static int64_t size = []() { + long l2_cache_size = sysconf(_SC_LEVEL2_CACHE_SIZE); + assert(l2_cache_size != -1); + return l2_cache_size >> 1; // use 50% of L2 cache + }(); + return size; +} +} // namespace cpu_utils + +#endif diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md index be99cef3723e..d1beab7855b1 100644 --- a/docs/getting_started/installation/cpu.md +++ b/docs/getting_started/installation/cpu.md @@ -97,7 +97,6 @@ Currently, there are no pre-built CPU wheels. - `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads, can be set as CPU id lists, `auto` (by default), or `nobind` (to disable binding to individual CPU cores and to inherit user-defined OpenMP variables). For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. 
`VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. By setting to `auto`, the OpenMP threads of each rank are bound to the CPU cores in each NUMA node respectively. If set to `nobind`, the number of OpenMP threads is determined by the standard `OMP_NUM_THREADS` environment variable. - `VLLM_CPU_NUM_OF_RESERVED_CPU`: specify the number of CPU cores which are not dedicated to the OpenMP threads for each rank. The variable only takes effect when VLLM_CPU_OMP_THREADS_BIND is set to `auto`. Default value is `None`. If the value is not set and use `auto` thread binding, no CPU will be reserved for `world_size == 1`, 1 CPU per rank will be reserved for `world_size > 1`. - `CPU_VISIBLE_MEMORY_NODES`: specify visible NUMA memory nodes for vLLM CPU workers, similar to ```CUDA_VISIBLE_DEVICES```. The variable only takes effect when VLLM_CPU_OMP_THREADS_BIND is set to `auto`. The variable provides more control for the auto thread-binding feature, such as masking nodes and changing nodes binding sequence. -- `VLLM_CPU_MOE_PREPACK` (x86 only): whether to use prepack for MoE layer. This will be passed to `ipex.llm.modules.GatedMLPMOE`. Default is `1` (True). On unsupported CPUs, you might need to set this to `0` (False). - `VLLM_CPU_SGL_KERNEL` (x86 only, Experimental): whether to use small-batch optimized kernels for linear layer and MoE layer, especially for low-latency requirements like online serving. The kernels require AMX instruction set, BFloat16 weight type and weight shapes divisible by 32. Default is `0` (False). ## FAQ @@ -191,10 +190,9 @@ vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel - GPTQ (x86 only) - compressed-tensor INT8 W8A8 (x86, s390x) -### (x86 only) What is the purpose of `VLLM_CPU_MOE_PREPACK` and `VLLM_CPU_SGL_KERNEL`? +### (x86 only) What is the purpose of `VLLM_CPU_SGL_KERNEL`? - Both of them require `amx` CPU flag. - - `VLLM_CPU_MOE_PREPACK` can provide better performance for MoE models - `VLLM_CPU_SGL_KERNEL` can provide better performance for MoE models and small-batch scenarios. ### Why do I see `get_mempolicy: Operation not permitted` when running in Docker? diff --git a/requirements/cpu.txt b/requirements/cpu.txt index d11787df4d92..e23d3286f3f7 100644 --- a/requirements/cpu.txt +++ b/requirements/cpu.txt @@ -22,7 +22,6 @@ datasets # for benchmark scripts # Intel Extension for PyTorch, only for x86_64 CPUs intel-openmp==2024.2.1; platform_machine == "x86_64" -intel_extension_for_pytorch==2.8.0; platform_machine == "x86_64" triton==3.2.0; platform_machine == "x86_64" # Triton is required for torch 2.6+cpu, as it is imported in torch.compile. 
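Stepping back from the diff for a moment: the CPU-backend environment variables documented in `cpu.md` above are typically exported before launching vLLM. The snippet below is an editorial sketch, not part of this patch; the core range, KV-cache size, and model name are placeholder assumptions (the model is the GPTQ checkpoint used by the new `test_cpu_wna16.py` test), and `VLLM_CPU_SGL_KERNEL` only helps on AMX-capable CPUs.

```python
# Illustrative sketch only: run the CPU backend with explicit thread binding.
# All values are placeholders; adjust them to the local machine.
import os

os.environ["VLLM_CPU_KVCACHE_SPACE"] = "40"       # GiB reserved for the CPU KV cache
os.environ["VLLM_CPU_OMP_THREADS_BIND"] = "0-31"  # bind 32 OpenMP threads to cores 0-31
os.environ["VLLM_CPU_SGL_KERNEL"] = "1"           # small-batch kernels (AMX CPUs only)

from vllm import LLM, SamplingParams

llm = LLM(model="TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", dtype="bfloat16")
outputs = llm.generate(["The capital of France is"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)
```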
# Use this to gather CPU info and optimize based on ARM Neoverse cores diff --git a/tests/quantization/test_cpu_wna16.py b/tests/quantization/test_cpu_wna16.py new file mode 100644 index 000000000000..077b802e559d --- /dev/null +++ b/tests/quantization/test_cpu_wna16.py @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest + +from vllm.platforms import current_platform + +if not current_platform.is_cpu(): + pytest.skip("skipping CPU-only tests", allow_module_level=True) + +MODELS = [ + "TheBloke/TinyLlama-1.1B-Chat-v1.0-AWQ", + "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", # with g_idx +] +DTYPE = ["bfloat16"] + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", DTYPE) +def test_ipex_quant(vllm_runner, model, dtype): + with vllm_runner(model, dtype=dtype) as llm: + output = llm.generate_greedy(["The capital of France is"], max_tokens=32) + assert output + print(output) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 096266c9764e..66cf6472eee4 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -2702,6 +2702,31 @@ def cpu_attention_with_kv_cache( ) +def cpu_gemm_wna16( + input: torch.Tensor, + q_weight: torch.Tensor, + scales: torch.Tensor, + zeros: torch.Tensor | None, + g_idx: torch.Tensor | None, + bias: torch.Tensor | None, + pack_factor: int, + isa_hint: str, +) -> torch.Tensor: + output = torch.empty((input.size(0), scales.size(1)), dtype=input.dtype) + torch.ops._C.cpu_gemm_wna16( + input, + q_weight, + output, + scales, + zeros, + g_idx, + bias, + pack_factor, + isa_hint, + ) + return output + + if hasattr(torch.ops._qutlass_C, "matmul_mxf4_bf16_tn"): @register_fake("_qutlass_C::matmul_mxf4_bf16_tn") diff --git a/vllm/config/model.py b/vllm/config/model.py index 49fe0bcd9a2a..3e8790a26e0e 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -1020,6 +1020,8 @@ def _verify_quantization(self) -> None: # Ensure heavy backends are probed last to avoid unnecessary # imports during override detection (e.g., MXFP4 imports Triton) "mxfp4", + "cpu_gptq", + "cpu_awq", ] quantization_methods = [ q for q in supported_quantization if q not in overrides diff --git a/vllm/envs.py b/vllm/envs.py index 62b3344ccd85..6d92d5afee50 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -50,7 +50,6 @@ VLLM_CPU_KVCACHE_SPACE: int | None = 0 VLLM_CPU_OMP_THREADS_BIND: str = "" VLLM_CPU_NUM_OF_RESERVED_CPU: int | None = None - VLLM_CPU_MOE_PREPACK: bool = True VLLM_CPU_SGL_KERNEL: bool = False VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache") VLLM_XLA_CHECK_RECOMPILATION: bool = False @@ -665,10 +664,6 @@ def get_vllm_port() -> int | None: ) if "VLLM_CPU_NUM_OF_RESERVED_CPU" in os.environ else None, - # (CPU backend only) whether to use prepack for MoE layer. This will be - # passed to ipex.llm.modules.GatedMLPMOE. On unsupported CPUs, you might - # need to set this to "0" (False). - "VLLM_CPU_MOE_PREPACK": lambda: bool(int(os.getenv("VLLM_CPU_MOE_PREPACK", "1"))), # (CPU backend only) whether to use SGL kernels, optimized for small batch. 
"VLLM_CPU_SGL_KERNEL": lambda: bool(int(os.getenv("VLLM_CPU_SGL_KERNEL", "0"))), # If the env var is set, Ray Compiled Graph uses the specified diff --git a/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py b/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py index 23ace3408562..572307052b48 100644 --- a/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py @@ -6,7 +6,6 @@ from torch.nn import functional as F from vllm import _custom_ops as ops -from vllm import envs def silu_and_mul(x: torch.Tensor) -> torch.Tensor: @@ -130,54 +129,6 @@ def select_experts( ) -class IPEXFusedMOE: - def __init__(self, layer: torch.nn.Module) -> None: - import intel_extension_for_pytorch as ipex - - layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE( - layer.w13_weight, - layer.w2_weight, - use_prepack=envs.VLLM_CPU_MOE_PREPACK, - ) - - def __call__( - self, - layer: torch.nn.Module, - x: torch.Tensor, - use_grouped_topk: bool, - top_k: int, - router_logits: torch.Tensor, - renormalize: bool, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - ) -> torch.Tensor: - assert activation == "silu", f"{activation} is not supported." - assert not apply_router_weight_on_input - assert routed_scaling_factor == 1.0, ( - f"routed_scaling_factor {routed_scaling_factor} is not supported." - ) - return layer.ipex_fusion( - x, - use_grouped_topk, - top_k, - router_logits, - renormalize, - topk_group, - num_expert_group, - custom_routing_function, - scoring_func, - e_score_correction_bias, - ) - - class SGLFusedMOE: def __init__(self, layer: torch.nn.Module) -> None: pass diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py index ce56887f1c26..2e0376553b91 100644 --- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py @@ -260,7 +260,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.w2_weight.copy_(packed_w2_weight) layer.cpu_fused_moe = cpu_fused_moe.SGLFusedMOE(layer) else: - layer.cpu_fused_moe = cpu_fused_moe.IPEXFusedMOE(layer) + layer.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer) else: layer.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer) diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index bb42b10f8718..18aaae394f93 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -38,6 +38,8 @@ "inc", "mxfp4", "petit_nvfp4", + "cpu_gptq", + "cpu_awq", ] QUANTIZATION_METHODS: list[str] = list(get_args(QuantizationMethods)) @@ -107,6 +109,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: from .compressed_tensors.compressed_tensors import ( CompressedTensorsConfig, ) + from .cpu_wna16 import CPUAWQConfig, CPUGPTQConfig from .deepspeedfp import DeepSpeedFPConfig from .experts_int8 import ExpertsInt8Config from .fbgemm_fp8 import FBGEMMFp8Config @@ -159,6 +162,8 @@ def get_quantization_config(quantization: str) -> 
type[QuantizationConfig]: "inc": INCConfig, "mxfp4": Mxfp4Config, "petit_nvfp4": PetitNvFp4Config, + "cpu_gptq": CPUGPTQConfig, + "cpu_awq": CPUAWQConfig, } # Update the `method_to_config` with customized quantization methods. method_to_config.update(_CUSTOMIZED_METHOD_TO_QUANT_CONFIG) diff --git a/vllm/model_executor/layers/quantization/cpu_wna16.py b/vllm/model_executor/layers/quantization/cpu_wna16.py new file mode 100644 index 000000000000..bf643f55f1b9 --- /dev/null +++ b/vllm/model_executor/layers/quantization/cpu_wna16.py @@ -0,0 +1,625 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import Any, Optional + +import torch +from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE + +from vllm._custom_ops import ( + cpu_gemm_wna16, +) +from vllm.logger import init_logger +from vllm.model_executor.layers.linear import ( + LinearBase, + LinearMethodBase, + UnquantizedLinearMethod, +) +from vllm.model_executor.layers.quantization import QuantizationMethods +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, + QuantizeMethodBase, +) +from vllm.model_executor.layers.quantization.utils.gptq_utils import ( + get_linear_quant_method, +) +from vllm.model_executor.layers.quantization.utils.marlin_utils import ( + marlin_repeat_scales_on_all_ranks, +) +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + is_layer_skipped, + pack_cols, + unpack_cols, +) +from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.models.utils import WeightsMapper +from vllm.model_executor.parameter import ( + ChannelQuantScaleParameter, + GroupQuantScaleParameter, + PackedColumnParameter, + PackedvLLMParameter, + RowvLLMParameter, +) +from vllm.model_executor.utils import set_weight_attrs +from vllm.platforms import current_platform +from vllm.transformers_utils.config import get_safetensors_params_metadata +from vllm.utils.collection_utils import is_list_of + +logger = init_logger(__name__) + + +class CPUGPTQConfig(QuantizationConfig): + """Config class for CPU GPTQ quant""" + + def __init__( + self, + weight_bits: int, + group_size: int, + desc_act: bool, + is_sym: bool, + lm_head_quantized: bool, + dynamic: dict[str, dict[str, int | bool]], + full_config: dict[str, Any], + modules_in_block_to_quantize: list[str] | None = None, + ) -> None: + super().__init__() + if desc_act and group_size == -1: + # In this case, act_order == True is the same as act_order == False + # (since we have only one group per output channel) + desc_act = False + + # GPTQModel use `dynamic` config property to allow per module + # quantization config so each module can be individually optimized. + # Format is dict[str, dict] where key is a regex string that can + # perform both positive ("+:" prefixed) or negative ("-:" prefixed) + # matching of a module. + # Default to positive match, override base quant config mode, if no + # prefix is used. Value is in dict format of field key and override + # value. + # Negative matching will skip quantization init for this module + # entirely: + # non-quantized inference. 
More details and quantization examples can be + # found at: https://github.com/ModelCloud/GPTQModel + # Example: + # # last 1/2 of the layers 10-21 has 8bit vs 4bit for 0-9 + # # last 1/4 of the layers 16-21 has 8bit and group_size 64 + # dynamic = { + # #`.*\.` matches the layers_node prefix + # # positive match layer 10-15 + # r"+:.*\.(?:1[0-5])\..*": {"bits": 8,}, + # # positive match layer 16-21 + # r"+:.*\.(?:1[6-9]|20|21)\..*": {"bits": 8, "group_size": 64,}, + # r"-:.*\.moe\..*": {}, # negative match (skip) all `moe` layers + # } + assert weight_bits == 4 + self.dynamic = dynamic + self.weight_bits = weight_bits + self.is_sym = is_sym + self.pack_factor = 32 // weight_bits # packed into int32 + self.group_size = group_size + self.desc_act = desc_act + self.lm_head_quantized = lm_head_quantized + self.full_config = full_config + self.modules_in_block_to_quantize = modules_in_block_to_quantize or [] + + def __repr__(self) -> str: + return ( + f"CPUWNA16Config(" + f"group_size={self.group_size}, " + f"desc_act={self.desc_act}, " + f"lm_head_quantized={self.lm_head_quantized}, " + f"dynamic={self.dynamic}, " + f"modules_in_block_to_quantize={self.modules_in_block_to_quantize})" + ) + + @classmethod + def get_name(cls) -> QuantizationMethods: + return "cpu_gptq" + + @classmethod + def get_supported_act_dtypes(cls) -> list[torch.dtype]: + return [torch.half, torch.bfloat16] + + @classmethod + def get_min_capability(cls) -> int: + return -1 + + @classmethod + def get_config_filenames(cls) -> list[str]: + return ["quantize_config.json"] + + @classmethod + def from_config(cls, config: dict[str, Any]) -> "CPUGPTQConfig": + weight_bits = cls.get_from_keys(config, ["bits"]) + desc_act = cls.get_from_keys_or(config, ["desc_act"], default=False) + dynamic = cls.get_from_keys_or(config, ["dynamic"], default={}) + group_size = cls.get_from_keys(config, ["group_size"]) + is_sym = cls.get_from_keys(config, ["sym"]) + lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False) + modules_in_block_to_quantize = cls.get_from_keys_or( + config, ["modules_in_block_to_quantize"], default=None + ) + return cls( + weight_bits, + group_size, + desc_act, + is_sym, + lm_head_quantized, + dynamic, + config, + modules_in_block_to_quantize, + ) + + @classmethod + def override_quantization_method( + cls, hf_quant_cfg, user_quant + ) -> QuantizationMethods | None: + quant_method = hf_quant_cfg.get("quant_method", "").lower() + if current_platform.is_cpu() and (quant_method == "gptq"): + return cls.get_name() + return None + + def get_quant_method( + self, layer: torch.nn.Module, prefix: str + ) -> Optional["QuantizeMethodBase"]: + return get_linear_quant_method(self, layer, prefix, CPUGPTQLinearMethod) # type: ignore + + def apply_vllm_mapper(self, hf_to_vllm_mapper): + if self.modules_in_block_to_quantize is not None: + self.modules_in_block_to_quantize = hf_to_vllm_mapper.apply_list( + self.modules_in_block_to_quantize + ) + + def maybe_update_config(self, model_name: str, revision: str | None = None): + if self.modules_in_block_to_quantize: + if is_list_of(self.modules_in_block_to_quantize, list): + # original modules_in_block_to_quantize: list[list[str]] + # flatten original modules_in_block_to_quantize + self.modules_in_block_to_quantize = [ + item + for sublist in self.modules_in_block_to_quantize + for item in sublist + ] + return + + unquant_dtypes = [torch.float16, torch.bfloat16, torch.float32] + metadata = get_safetensors_params_metadata(model_name, revision=revision) + quant_layers: 
set[str] = { + param_name.rsplit(".", 1)[0] + for param_name, info in metadata.items() + if (dtype := info.get("dtype", None)) + and _SAFETENSORS_TO_TORCH_DTYPE[dtype] not in unquant_dtypes + } + self.modules_in_block_to_quantize = list(quant_layers) + + +class CPUGPTQLinearMethod(LinearMethodBase): + """Linear method for GPTQ on CPU. + + Args: + quant_config: The CPUWNA16 quantization config. + """ + + def __init__(self, quant_config: CPUGPTQConfig) -> None: + self.quant_config = quant_config + assert self.quant_config.is_sym, "GPTQ asym quant is not supported on CPU" + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ) -> None: + output_size_per_partition = sum(output_partition_sizes) + assert output_size_per_partition * self.quant_config.weight_bits % 32 == 0 + assert output_size_per_partition % 32 == 0 + assert input_size_per_partition % 32 == 0 + + is_row_parallel = input_size != input_size_per_partition + weight_loader = extra_weight_attrs.get("weight_loader") + + # Normalize group_size + if self.quant_config.group_size != -1: + group_size = self.quant_config.group_size + else: + group_size = input_size + + # Determine sharding + if marlin_repeat_scales_on_all_ranks( + self.quant_config.desc_act, self.quant_config.group_size, is_row_parallel + ): + # By setting scale_dim == None, weight_loader will + # repeat the scales on each rank in TP>1 case. + scales_and_zp_input_dim = None + scales_and_zp_size = input_size // group_size + else: + # By setting scale_dim == 0, weight_loader will + # shard the scales in TP>1 case. + scales_and_zp_input_dim = 0 + scales_and_zp_size = input_size_per_partition // group_size + + # Quantized weights + qweight = PackedvLLMParameter( + data=torch.empty( + input_size_per_partition // self.quant_config.pack_factor, + output_size_per_partition, + dtype=torch.int32, + ), + input_dim=0, + output_dim=1, + packed_dim=0, + packed_factor=self.quant_config.pack_factor, + weight_loader=weight_loader, + ) + + # Activation order + g_idx = RowvLLMParameter( + data=torch.empty( + input_size_per_partition, + dtype=torch.int32, + ), + input_dim=0, + weight_loader=weight_loader, + ) + set_weight_attrs( + g_idx, + {"ignore_warning": True}, + ) + + qzeros_args = { + "data": torch.empty( + scales_and_zp_size, + output_size_per_partition // self.quant_config.pack_factor, + dtype=torch.int32, + ), + "weight_loader": weight_loader, + } + weight_scale_args = { + "data": torch.empty( + scales_and_zp_size, + output_size_per_partition, + dtype=params_dtype, + ), + "weight_loader": weight_loader, + } + + if scales_and_zp_input_dim is None: + scales = ChannelQuantScaleParameter(output_dim=1, **weight_scale_args) + qzeros = PackedColumnParameter( + output_dim=1, + packed_dim=1, + packed_factor=self.quant_config.pack_factor, + **qzeros_args, + ) + + else: + scales = GroupQuantScaleParameter( + output_dim=1, input_dim=0, **weight_scale_args + ) + qzeros = PackedvLLMParameter( + input_dim=0, + output_dim=1, + packed_dim=1, + packed_factor=self.quant_config.pack_factor, + **qzeros_args, + ) + + layer.register_parameter("qweight", qweight) + layer.register_parameter("g_idx", g_idx) + layer.register_parameter("scales", scales) + layer.register_parameter("qzeros", qzeros) + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + torch.set_printoptions(profile="full", linewidth=5000, sci_mode=False) + 
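An editorial note on the repacking performed just below (the sketch is not part of the diff): a GPTQ checkpoint stores eight 4-bit values per `int32` along the input dimension, whereas the new `cpu_gemm_wna16` kernel expects the weight as blocks of 16 output channels, each block contiguous along K, i.e. shape `[N / 16, K * 16 / pack_factor]`, consumed together with scales of shape `[K / group_size, N]`. The toy code below mirrors only the final `view`/`permute`/`reshape` bookkeeping on an already column-packed `[K, N / pack_factor]` tensor; the tensor contents and sizes are placeholders.

```python
# Minimal layout sketch, assuming a column-packed [K, N/8] int32 weight
# (what pack_cols produces); only the shape bookkeeping mirrors the code below.
import torch

K, N, pack_factor = 64, 32, 8  # 4-bit weights: 8 nibbles per int32
packed = torch.zeros(K, N // pack_factor, dtype=torch.int32)

blocked = (
    packed.view(K, -1, 16 // pack_factor)  # two int32 columns == 16 output channels
    .permute(1, 0, 2)                      # bring the channel-block dim to the front
    .reshape(-1, K * 16 // pack_factor)    # one contiguous row per 16-channel block
    .contiguous()
)
assert blocked.shape == (N // 16, K * 16 // pack_factor)
```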
packed_weight = layer.qweight.data + bits = self.quant_config.weight_bits + pack_factor = int(self.quant_config.pack_factor) + p_w_k, p_w_n = packed_weight.size() + input_size = p_w_k * pack_factor + output_size = p_w_n + isa_hint = _get_isa_hint(layer.scales.dtype) + layer.isa_hint = isa_hint + + layer.qzeros = None + if not self.quant_config.desc_act: + layer.g_idx = None + + # convert input dim packed to output dim packed + weight = unpack_cols(packed_weight, bits, p_w_k, p_w_n * pack_factor).view( + p_w_k, p_w_n, pack_factor + ) + weight = weight.permute(0, 2, 1).reshape(input_size, output_size).contiguous() + weight = pack_cols(weight, bits, input_size, output_size) + # make 16 output channel as a block and transpose to the make + # the block contigous + weight = ( + weight.view(input_size, -1, 16 // pack_factor) + .permute(1, 0, 2) + .reshape(-1, input_size * 16 // pack_factor) + .contiguous() + ) + layer.qweight.data = weight + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + x = cpu_gemm_wna16( + input=x, + q_weight=layer.qweight, + scales=layer.scales, + zeros=layer.qzeros, + g_idx=layer.g_idx, + bias=bias, + pack_factor=8, + isa_hint=layer.isa_hint, + ) + return x + + +class CPUAWQConfig(QuantizationConfig): + """Config class for CPU AWQ""" + + def __init__( + self, + weight_bits: int, + group_size: int, + zero_point: bool, + lm_head_quantized: bool, + modules_to_not_convert: list[str] | None, + full_config: dict[str, Any], + ) -> None: + super().__init__() + assert weight_bits == 4 + self.pack_factor = 32 // weight_bits # packed into int32 + self.group_size = group_size + self.zero_point = zero_point + self.lm_head_quantized = lm_head_quantized + self.weight_bits = weight_bits + self.modules_to_not_convert = modules_to_not_convert or [] + self.full_config = full_config + + def __repr__(self) -> str: + return ( + f"AWQMarlinConfig(" + f"group_size={self.group_size}, " + f"zero_point={self.zero_point}, " + f"lm_head_quantized={self.lm_head_quantized}, " + f"modules_to_not_convert={self.modules_to_not_convert})" + ) + + @classmethod + def get_name(cls) -> "QuantizationMethods": + return "cpu_awq" + + @classmethod + def get_supported_act_dtypes(cls) -> list[torch.dtype]: + return [torch.half, torch.bfloat16] + + @classmethod + def get_min_capability(cls) -> int: + return -1 + + @classmethod + def get_config_filenames(cls) -> list[str]: + return ["quantize_config.json"] + + @classmethod + def from_config(cls, config: dict[str, Any]) -> "CPUAWQConfig": + weight_bits = cls.get_from_keys(config, ["bits"]) + group_size = cls.get_from_keys(config, ["group_size"]) + zero_point = cls.get_from_keys(config, ["zero_point"]) + lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False) + modules_to_not_convert = cls.get_from_keys_or( + config, ["modules_to_not_convert"], None + ) + return cls( + weight_bits, + group_size, + zero_point, + lm_head_quantized, + modules_to_not_convert, + config, + ) + + @classmethod + def override_quantization_method( + cls, hf_quant_cfg, user_quant + ) -> Optional["QuantizationMethods"]: + quant_method = hf_quant_cfg.get("quant_method", "").lower() + if current_platform.is_cpu() and (quant_method == "awq"): + return cls.get_name() + return None + + def get_quant_method( + self, layer: torch.nn.Module, prefix: str + ) -> Optional["QuantizeMethodBase"]: + if isinstance(layer, LinearBase) or ( + isinstance(layer, ParallelLMHead) and self.lm_head_quantized + ): + if 
is_layer_skipped( + prefix, + self.modules_to_not_convert, + self.packed_modules_mapping, + skip_with_substr=True, + ): + return UnquantizedLinearMethod() + return CPUAWQLinearMethod(self) + return None + + def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): + if self.modules_to_not_convert: + self.modules_to_not_convert = hf_to_vllm_mapper.apply_list( + self.modules_to_not_convert + ) + + def maybe_update_config(self, model_name: str, revision: str | None = None): + if self.modules_to_not_convert: + return + + unquant_dtypes = [torch.float16, torch.bfloat16, torch.float32] + metadata = get_safetensors_params_metadata(model_name, revision=revision) + layers = {param_name.rsplit(".", 1)[0] for param_name in metadata} + quant_layers: set[str] = { + param_name.rsplit(".", 1)[0] + for param_name, info in metadata.items() + if (dtype := info.get("dtype", None)) + and _SAFETENSORS_TO_TORCH_DTYPE[dtype] not in unquant_dtypes + } + self.modules_to_not_convert = list(layers - quant_layers) + + +class CPUAWQLinearMethod(LinearMethodBase): + """Linear method for CPU AWQ. + + Args: + quant_config: The CPU AWQ quantization config. + """ + + def __init__(self, quant_config: CPUAWQConfig) -> None: + self.quant_config = quant_config + assert self.quant_config.zero_point + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ) -> None: + del output_size + output_size_per_partition = sum(output_partition_sizes) + weight_loader = extra_weight_attrs.get("weight_loader") + + # Normalize group_size + if self.quant_config.group_size != -1: + group_size = self.quant_config.group_size + else: + group_size = input_size + + qweight = PackedvLLMParameter( + data=torch.empty( + input_size_per_partition, + output_size_per_partition // self.quant_config.pack_factor, + dtype=torch.int32, + ), + input_dim=0, + output_dim=1, + packed_dim=1, + packed_factor=self.quant_config.pack_factor, + weight_loader=weight_loader, + ) + + num_groups = input_size_per_partition // group_size + + qzeros = PackedvLLMParameter( + data=torch.empty( + num_groups, + output_size_per_partition // self.quant_config.pack_factor, + dtype=torch.int32, + ), + input_dim=0, + output_dim=1, + packed_dim=1, + packed_factor=self.quant_config.pack_factor, + weight_loader=weight_loader, + ) + + scales = GroupQuantScaleParameter( + data=torch.empty( + num_groups, + output_size_per_partition, + dtype=params_dtype, + ), + input_dim=0, + output_dim=1, + weight_loader=weight_loader, + ) + + layer.register_parameter("qweight", qweight) + layer.register_parameter("qzeros", qzeros) + layer.register_parameter("scales", scales) + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + torch.set_printoptions(profile="full", linewidth=5000, sci_mode=False) + packed_weight = layer.qweight.data + packed_zeros = layer.qzeros.data + group_num = packed_zeros.size(0) + bits = self.quant_config.weight_bits + pack_factor = int(self.quant_config.pack_factor) + input_size, packed_output_size = packed_weight.size() + output_size = packed_output_size * pack_factor + isa_hint = _get_isa_hint(layer.scales.dtype) + layer.isa_hint = isa_hint + + interleave_map = (0, 4, 1, 5, 2, 6, 3, 7) + weight = unpack_cols( + packed_weight, + bits, + input_size, + output_size, + ) + zeros = unpack_cols( + packed_zeros, + bits, + group_num, + output_size, + ) + weight = ( + weight.view(input_size, -1, 
pack_factor)[:, :, interleave_map] + .reshape(input_size, output_size) + .contiguous() + ) + zeros = ( + zeros.view(group_num, -1, pack_factor)[:, :, interleave_map] + .reshape(group_num, output_size) + .contiguous() + ) + + zeros = pack_cols(zeros, bits, group_num, output_size).contiguous() + # make 16 output channel as a block and transpose to + # the make the block contigous + weight = pack_cols(weight, bits, input_size, output_size) + weight = ( + weight.view(input_size, -1, 16 // pack_factor) + .permute(1, 0, 2) + .reshape(-1, input_size * 16 // pack_factor) + .contiguous() + ) + layer.qweight.data = weight + layer.qzeros.data = zeros + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + x = cpu_gemm_wna16( + input=x, + q_weight=layer.qweight, + scales=layer.scales, + zeros=layer.qzeros, + g_idx=None, + bias=bias, + pack_factor=8, + isa_hint=layer.isa_hint, + ) + return x + + +def _get_isa_hint(dtype: torch.dtype) -> str: + supports_amx = torch._C._cpu._is_amx_tile_supported() + if supports_amx and dtype in (torch.bfloat16,): + return "amx" + else: + return "vec" diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py index 5ca9167faec8..22c4bae041a5 100644 --- a/vllm/model_executor/layers/quantization/ipex_quant.py +++ b/vllm/model_executor/layers/quantization/ipex_quant.py @@ -134,7 +134,7 @@ def from_config(cls, config: dict[str, Any]) -> "IPEXConfig": def override_quantization_method( cls, hf_quant_cfg, user_quant ) -> QuantizationMethods | None: - if not current_platform.is_cpu() and not current_platform.is_xpu(): + if not current_platform.is_xpu(): return None quant_method = hf_quant_cfg.get("quant_method", "").lower() From 814843e021a3618f7f8e494d4f0d4fd561cf3225 Mon Sep 17 00:00:00 2001 From: Strahinja Stamenkovic Date: Wed, 19 Nov 2025 04:12:31 +0100 Subject: [PATCH 171/578] Enable bitsandbytes quantization on AMD GPUs that use warp size 32 (#27307) Signed-off-by: sstamenk --- tests/models/quantization/test_bitsandbytes.py | 11 +++++++---- vllm/platforms/rocm.py | 3 +++ 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/models/quantization/test_bitsandbytes.py b/tests/models/quantization/test_bitsandbytes.py index 24220978534c..dc4b4546e451 100644 --- a/tests/models/quantization/test_bitsandbytes.py +++ b/tests/models/quantization/test_bitsandbytes.py @@ -14,10 +14,13 @@ from ...utils import compare_two_settings, multi_gpu_test from ..utils import check_embeddings_close, check_logprobs_close -pytestmark = pytest.mark.skipif( - current_platform.is_rocm(), - reason="bitsandbytes quantization not supported on ROCm (CUDA-only kernels)", -) +if current_platform.is_rocm(): + from vllm.platforms.rocm import on_gfx9 + + pytestmark = pytest.mark.skipif( + on_gfx9(), + reason="bitsandbytes not supported on gfx9 (warp size 64 limitation)", + ) models_4bit_to_test = [ ("facebook/opt-125m", "quantize opt model inflight"), diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 788f9d69c357..bb116792fed5 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -185,6 +185,9 @@ class RocmPlatform(Platform): "petit_nvfp4", "torchao", ] + # bitsandbytes not supported on gfx9 (warp size 64 limitation) + if not on_gfx9(): + supported_quantization += ["bitsandbytes"] @classmethod def get_vit_attn_backend( From 4c23690f43e51eccf6ce5866ac47adcf39215e4d Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 18 Nov 2025 
23:06:21 -0500 Subject: [PATCH 172/578] [Attention] FlashAttention ViT support, make default backend (#28763) Signed-off-by: Matthew Bonanni --- cmake/external_projects/vllm_flash_attn.cmake | 2 +- tests/kernels/attention/test_flash_attn.py | 4 +-- tests/kernels/attention/test_mha_attn.py | 30 +------------------ vllm/platforms/cuda.py | 21 ++++++------- vllm/v1/attention/backends/flash_attn.py | 4 +-- 5 files changed, 15 insertions(+), 46 deletions(-) diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index 567c8959f045..6cc5cda14c52 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -38,7 +38,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 58e0626a692f09241182582659e3bf8f16472659 + GIT_TAG 71bb26f6295449be880344b93b51791cc009237d GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn diff --git a/tests/kernels/attention/test_flash_attn.py b/tests/kernels/attention/test_flash_attn.py index 6e5468969bf2..26b8c77ab482 100644 --- a/tests/kernels/attention/test_flash_attn.py +++ b/tests/kernels/attention/test_flash_attn.py @@ -13,14 +13,14 @@ ) NUM_HEADS = [(4, 4), (8, 2)] -HEAD_SIZES = [128, 256] +HEAD_SIZES = [40, 72, 80, 128, 256] BLOCK_SIZES = [16] DTYPES = [torch.bfloat16] QDTYPES = [None, torch.float8_e4m3fn] # one value large enough to test overflow in index calculation. # one value small enough to test the schema op check NUM_BLOCKS = [32768, 2048] -SOFT_CAPS = [None, 50.0] +SOFT_CAPS = [None] SLIDING_WINDOWS = [None, 256] diff --git a/tests/kernels/attention/test_mha_attn.py b/tests/kernels/attention/test_mha_attn.py index 183bbf3bf4e0..a878ac6396ce 100644 --- a/tests/kernels/attention/test_mha_attn.py +++ b/tests/kernels/attention/test_mha_attn.py @@ -62,38 +62,10 @@ def test_mha_attn_platform(device: str): assert attn.attn_backend == AttentionBackendEnum.FLASH_ATTN # Test CUDA with head_size=72 (not divisible by 32) - # - with upstream FA not available - # - should use xformers - with ( - patch("vllm.attention.layer.current_platform", CudaPlatform()), - patch("vllm.model_executor.models.vision.current_platform", CudaPlatform()), - patch( - "vllm.attention.layer.check_upstream_fa_availability", - return_value=False, - ), - ): - attn = MultiHeadAttention(16, 72, scale=1) - assert attn.attn_backend == AttentionBackendEnum.XFORMERS - - # Test CUDA with head_size=72 (not divisible by 32) - # - with upstream FA available - # - should use upstream FA + # - should use vLLM's FlashAttention with ( patch("vllm.attention.layer.current_platform", CudaPlatform()), patch("vllm.model_executor.models.vision.current_platform", CudaPlatform()), - patch( - "vllm.attention.layer.check_upstream_fa_availability", return_value=True - ), - patch.dict( - "sys.modules", - { - "flash_attn": type( - "MockFlashAttn", - (), - {"flash_attn_varlen_func": lambda *args, **kwargs: None}, - )() - }, - ), ): attn = MultiHeadAttention(16, 72, scale=1) assert attn.attn_backend == AttentionBackendEnum.FLASH_ATTN diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 2e4dd8bb808b..f9bf242b7194 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -267,24 +267,21 @@ def get_vit_attn_backend( ) -> "AttentionBackendEnum": from vllm.attention.backends.registry import AttentionBackendEnum - # For Blackwell GPUs, force TORCH_SDPA for now. 
- # See https://github.com/facebookresearch/xformers/issues/1317#issuecomment-3199392579 # noqa: E501 - if cls.has_device_capability(100): - return AttentionBackendEnum.TORCH_SDPA - - if dtype not in (torch.float16, torch.bfloat16): - return AttentionBackendEnum.XFORMERS - - if cls.has_device_capability(80): + # Try FlashAttention first + try: backend_class = AttentionBackendEnum.FLASH_ATTN.get_class() if backend_class.supports_head_size( head_size ) and backend_class.supports_dtype(dtype): return AttentionBackendEnum.FLASH_ATTN - else: - return AttentionBackendEnum.XFORMERS + except ImportError: + pass + + if cls.has_device_capability(100): + # xFormers doesn't support Blackwell, fall back to SDPA + # See https://github.com/facebookresearch/xformers/issues/1317#issuecomment-3199392579 # noqa: E501 + return AttentionBackendEnum.TORCH_SDPA else: - # Fallback for Volta/Turing GPUs or FA not supported return AttentionBackendEnum.XFORMERS @classmethod diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index a5d4435000d4..fdc99a0df1c8 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -119,8 +119,8 @@ def get_fp8_dtype_for_flashattn(kv_cache_dtype: str) -> torch.dtype: raise ValueError(f"Unrecognized FP8 dtype: {kv_cache_dtype}") @classmethod - def get_supported_head_sizes(cls) -> list[int]: - return [32, 64, 96, 128, 160, 192, 224, 256] + def supports_head_size(cls, head_size: int) -> bool: + return head_size % 8 == 0 and head_size <= 256 @classmethod def supports_kv_cache_dtype(cls, kv_cache_dtype: CacheDType | None) -> bool: From 468a8d72bac181c1499320478940cec64363e107 Mon Sep 17 00:00:00 2001 From: Xin Yang <105740670+xyang16@users.noreply.github.com> Date: Tue, 18 Nov 2025 21:05:22 -0800 Subject: [PATCH 173/578] [Bugfix] Fix FusedMoEModularKernel for triton backend (#28913) Signed-off-by: Xin Yang --- vllm/model_executor/layers/quantization/mxfp4.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index b95d1a6b3a1f..66ae2e94c60a 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -755,8 +755,10 @@ def _interleave_mxfp4_cutlass_sm90(w): self.w13_weight = w13_weight self.w2_weight = w2_weight - layer.w13_weight = Parameter(w13_weight.storage.data, requires_grad=False) - layer.w2_weight = Parameter(w2_weight.storage.data, requires_grad=False) + del layer.w13_weight + del layer.w2_weight + layer.w13_weight = w13_weight + layer.w2_weight = w2_weight else: raise ValueError(f"Unsupported backend: {self.mxfp4_backend}") @@ -1065,8 +1067,8 @@ def apply( return triton_kernel_moe_forward( hidden_states=x, - w1=self.w13_weight, - w2=self.w2_weight, + w1=layer.w13_weight, + w2=layer.w2_weight, gating_output=router_logits, topk=top_k, renormalize=renormalize, From 73ff872db0d4e3f5e133d5d2a5307248619d93a6 Mon Sep 17 00:00:00 2001 From: Gleb Kurchanov Date: Wed, 19 Nov 2025 08:21:02 +0300 Subject: [PATCH 174/578] [Bugfix] Fix typo in Qwen3 Next model executor (#28960) Signed-off-by: Gleb Kurchanov --- vllm/model_executor/models/qwen3_next.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index 86508a7c6431..0415c8e00fdf 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ 
b/vllm/model_executor/models/qwen3_next.py @@ -1154,8 +1154,8 @@ def set_moe_parameters(self): example_moe = layer.mlp self.moe_layers.append(layer.mlp.experts) - if example_moe is None: - raise RuntimeError("No Qwen3Next layer found in the model.layers.") + if example_moe is None: + raise RuntimeError("No Qwen3Next layer found in the model.layers.") # Set MoE hyperparameters self.num_moe_layers = len(self.moe_layers) From 6a25ea5f0ea193e35b5a83cb0285c48964bc9eb1 Mon Sep 17 00:00:00 2001 From: Uranus <109661872+UranusSeven@users.noreply.github.com> Date: Wed, 19 Nov 2025 13:30:08 +0800 Subject: [PATCH 175/578] [Docs] Update oneshot imports (#28188) Signed-off-by: UranusSeven <109661872+UranusSeven@users.noreply.github.com> --- docs/features/quantization/fp8.md | 2 +- docs/features/quantization/int4.md | 2 +- docs/features/quantization/int8.md | 2 +- docs/features/quantization/quantized_kvcache.md | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/features/quantization/fp8.md b/docs/features/quantization/fp8.md index 0c5111fb8af0..d4a6176b236f 100644 --- a/docs/features/quantization/fp8.md +++ b/docs/features/quantization/fp8.md @@ -60,7 +60,7 @@ Since simple RTN does not require data for weight quantization and the activatio ??? code ```python - from llmcompressor.transformers import oneshot + from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier # Configure the simple PTQ quantization diff --git a/docs/features/quantization/int4.md b/docs/features/quantization/int4.md index 035e7ea291f9..9752039097d6 100644 --- a/docs/features/quantization/int4.md +++ b/docs/features/quantization/int4.md @@ -80,7 +80,7 @@ Now, apply the quantization algorithms: ??? code ```python - from llmcompressor.transformers import oneshot + from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier from llmcompressor.modifiers.smoothquant import SmoothQuantModifier diff --git a/docs/features/quantization/int8.md b/docs/features/quantization/int8.md index ec8a77f74ffe..701ca6378cb1 100644 --- a/docs/features/quantization/int8.md +++ b/docs/features/quantization/int8.md @@ -87,7 +87,7 @@ Now, apply the quantization algorithms: ??? 
code ```python - from llmcompressor.transformers import oneshot + from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier from llmcompressor.modifiers.smoothquant import SmoothQuantModifier diff --git a/docs/features/quantization/quantized_kvcache.md b/docs/features/quantization/quantized_kvcache.md index 56cf057678be..d26a5e217f31 100644 --- a/docs/features/quantization/quantized_kvcache.md +++ b/docs/features/quantization/quantized_kvcache.md @@ -78,7 +78,7 @@ Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models ```python from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer - from llmcompressor.transformers import oneshot + from llmcompressor import oneshot # Select model and load it MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct" From 3d4e7d34be856cc4f54033e6a019059afacb5e76 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Wed, 19 Nov 2025 05:43:01 +0000 Subject: [PATCH 176/578] [Model][QwenVL] Simplify cos/sin rotary embedding indexing (#28962) Signed-off-by: Lukas Geiger --- vllm/model_executor/models/glm4_1v.py | 9 ++------- vllm/model_executor/models/qwen2_5_vl.py | 9 ++------- vllm/model_executor/models/qwen2_vl.py | 9 ++------- .../models/qwen3_omni_moe_thinker.py | 9 ++------- vllm/model_executor/models/qwen3_vl.py | 17 +++-------------- 5 files changed, 11 insertions(+), 42 deletions(-) diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 2c2f45c2453e..7a4fee76ae6b 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -797,13 +797,8 @@ def rot_pos_emb( # Use pre-computed cos_sin_cache from RotaryEmbedding cos, sin = self.rotary_pos_emb.get_cos_sin(max_grid_size) - cos_h = cos[pos_ids[:, 0]] # (num_tokens, rotary_dim // 2) - cos_w = cos[pos_ids[:, 1]] - sin_h = sin[pos_ids[:, 0]] - sin_w = sin[pos_ids[:, 1]] - - cos_combined = torch.cat([cos_h, cos_w], dim=-1) - sin_combined = torch.cat([sin_h, sin_w], dim=-1) + cos_combined = cos[pos_ids].flatten(1) + sin_combined = sin[pos_ids].flatten(1) return cos_combined, sin_combined, pos_ids def compute_attn_mask_seqlen( diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 2e4fd9645d88..5b5d50ec8935 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -738,13 +738,8 @@ def rotary_pos_emb_thw(self, t, h, w): # Use pre-computed cos_sin_cache from RotaryEmbedding cos, sin = self.rotary_pos_emb.get_cos_sin(max_size) - cos_h = cos[pos_ids[:, 0]] # (num_tokens, rotary_dim // 2) - cos_w = cos[pos_ids[:, 1]] - sin_h = sin[pos_ids[:, 0]] - sin_w = sin[pos_ids[:, 1]] - - cos_combined = torch.cat([cos_h, cos_w], dim=-1) - sin_combined = torch.cat([sin_h, sin_w], dim=-1) + cos_combined = cos[pos_ids].flatten(1) + sin_combined = sin[pos_ids].flatten(1) cos_combined = cos_combined.reshape( cos_combined.shape[0] // self.spatial_merge_unit, diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 53df5972a8fe..cda8eaf5377f 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -724,13 +724,8 @@ def rot_pos_emb( # Use pre-computed cos_sin_cache from RotaryEmbedding cos, sin = self.rotary_pos_emb.get_cos_sin(max_grid_size) - cos_h = cos[pos_ids[:, 0]] # (num_tokens, rotary_dim // 2) - cos_w = cos[pos_ids[:, 1]] - sin_h = sin[pos_ids[:, 0]] - sin_w = sin[pos_ids[:, 1]] - - 
cos_combined = torch.cat([cos_h, cos_w], dim=-1) - sin_combined = torch.cat([sin_h, sin_w], dim=-1) + cos_combined = cos[pos_ids].flatten(1) + sin_combined = sin[pos_ids].flatten(1) return cos_combined, sin_combined def compute_attn_mask_seqlen( diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index 8274b92138f7..d2fd74a5e41a 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -428,13 +428,8 @@ def rot_pos_emb(self, grid_thw): # Use pre-computed cos_sin_cache from RotaryEmbedding cos, sin = self.rotary_pos_emb.get_cos_sin(max_grid_size) - cos_h = cos[pos_ids[:, 0]] # (num_tokens, rotary_dim // 2) - cos_w = cos[pos_ids[:, 1]] - sin_h = sin[pos_ids[:, 0]] - sin_w = sin[pos_ids[:, 1]] - - cos_combined = torch.cat([cos_h, cos_w], dim=-1) - sin_combined = torch.cat([sin_h, sin_w], dim=-1) + cos_combined = cos[pos_ids].flatten(1) + sin_combined = sin[pos_ids].flatten(1) return cos_combined, sin_combined diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 99a4007ef7f2..0c546309400b 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -459,18 +459,13 @@ def rot_pos_emb(self, grid_thw: list[list[int]]): else self.rot_pos_ids(h, w, self.spatial_merge_size).repeat(t, 1) for t, h, w in grid_thw ] - pos_ids = torch.cat(pos_ids, dim=0) + pos_ids = torch.cat(pos_ids, dim=0).to(self.device, non_blocking=True) # Use pre-computed cos_sin_cache from RotaryEmbedding cos, sin = self.rotary_pos_emb.get_cos_sin(max_grid_size) - cos_h = cos[pos_ids[:, 0]] # (num_tokens, rotary_dim // 2) - cos_w = cos[pos_ids[:, 1]] - sin_h = sin[pos_ids[:, 0]] - sin_w = sin[pos_ids[:, 1]] - - cos_combined = torch.cat([cos_h, cos_w], dim=-1) - sin_combined = torch.cat([sin_h, sin_w], dim=-1) + cos_combined = cos[pos_ids].flatten(1) + sin_combined = sin[pos_ids].flatten(1) return cos_combined, sin_combined @@ -566,12 +561,6 @@ def forward( pos_embeds = self.fast_pos_embed_interpolate(grid_thw_list) hidden_states = hidden_states + pos_embeds rotary_pos_emb_cos, rotary_pos_emb_sin = self.rot_pos_emb(grid_thw_list) - rotary_pos_emb_cos = rotary_pos_emb_cos.to( - hidden_states.device, non_blocking=True - ) - rotary_pos_emb_sin = rotary_pos_emb_sin.to( - hidden_states.device, non_blocking=True - ) cu_seqlens = torch.repeat_interleave( grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0] From 71d0ae1c54543689ea7541aa20b9522982b0815e Mon Sep 17 00:00:00 2001 From: Roman Solomatin Date: Wed, 19 Nov 2025 09:28:40 +0300 Subject: [PATCH 177/578] [Misc] Update embedding/cross encoder tests to use `mteb` v2 (#27329) Signed-off-by: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Signed-off-by: wang.yuqi Signed-off-by: wang.yuqi Co-authored-by: Cyrus Leung Co-authored-by: Isotr0py Co-authored-by: wang.yuqi Co-authored-by: wang.yuqi --- requirements/test.in | 2 +- requirements/test.txt | 4 +- .../language/pooling_mteb_test/mteb_utils.py | 179 +++++++++++------- .../test_bge_reranker_v2_gemma.py | 31 ++- .../pooling_mteb_test/test_mxbai_rerank.py | 5 +- .../pooling_mteb_test/test_qwen3_reranker.py | 5 +- 6 files changed, 143 insertions(+), 83 deletions(-) diff --git a/requirements/test.in b/requirements/test.in index 30d97e9b9c7d..05f6bcca5c2c 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -36,7 +36,7 @@ opencv-python-headless >= 4.11.0 # required for video test datamodel_code_generator # 
required for minicpm3 test # TODO: Use lm-eval[api]==0.4.10 once released lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test -mteb[bm25s]>=1.38.11, <2 # required for mteb test +mteb[bm25s]>=2, <3 # required for mteb test transformers==4.57.1 tokenizers==0.22.0 schemathesis>=3.39.15 # Required for openai schema test. diff --git a/requirements/test.txt b/requirements/test.txt index 3263b74c0879..bcd511660f85 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -201,8 +201,6 @@ email-validator==2.2.0 # via pydantic encodec==0.1.1 # via vocos -eval-type-backport==0.2.2 - # via mteb evaluate==0.4.3 # via lm-eval fastapi==0.116.1 @@ -490,7 +488,7 @@ msgpack==1.1.0 # via # librosa # ray -mteb==1.38.11 +mteb==2.1.2 # via -r requirements/test.in multidict==6.1.0 # via diff --git a/tests/models/language/pooling_mteb_test/mteb_utils.py b/tests/models/language/pooling_mteb_test/mteb_utils.py index 0384ff82790f..189cdbae99dc 100644 --- a/tests/models/language/pooling_mteb_test/mteb_utils.py +++ b/tests/models/language/pooling_mteb_test/mteb_utils.py @@ -2,12 +2,14 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import tempfile -from collections.abc import Sequence import mteb import numpy as np import requests import torch +from mteb.models import ModelMeta +from mteb.types import Array +from torch.utils.data import DataLoader import tests.ci_envs as ci_envs from tests.models.utils import ( @@ -27,24 +29,47 @@ # See #19344 MTEB_RERANK_TASKS = ["NFCorpus"] -MTEB_RERANK_LANGS = ["en"] +MTEB_RERANK_LANGS = ["eng"] MTEB_RERANK_TOL = 2e-3 +_empty_model_meta = ModelMeta( + loader=None, + name="vllm/model", + revision="1", + release_date=None, + languages=None, + framework=[], + similarity_fn_name=None, + n_parameters=None, + memory_usage_mb=None, + max_tokens=None, + embed_dim=None, + license=None, + open_weights=None, + public_training_code=None, + public_training_data=None, + use_instructions=None, + training_datasets=None, + modalities=["text"], # 'image' can be added to evaluate multimodal models +) + + +class VllmMtebEncoder(mteb.EncoderProtocol): + mteb_model_meta = _empty_model_meta -class VllmMtebEncoder(mteb.Encoder): def __init__(self, vllm_model): - super().__init__() self.llm = vllm_model self.rng = np.random.default_rng(seed=42) def encode( self, - sentences: Sequence[str], + inputs: DataLoader[mteb.types.BatchedInput], *args, **kwargs, ) -> np.ndarray: # Hoping to discover potential scheduling # issues by randomizing the order. 
+ sentences = [text for batch in inputs for text in batch["text"]] r = self.rng.permutation(len(sentences)) sentences = [sentences[i] for i in r] outputs = self.llm.embed(sentences, use_tqdm=False) @@ -52,36 +77,70 @@ def encode( embeds = embeds[np.argsort(r)] return embeds + def similarity( + self, + embeddings1: np.ndarray, + embeddings2: np.ndarray, + ) -> np.ndarray: + # Cosine similarity + norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True) + norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True) + sim = np.dot(embeddings1, embeddings2.T) / (norm1 * norm2.T) + return sim + + def similarity_pairwise( + self, + embeddings1: Array, + embeddings2: Array, + ) -> Array: + # Cosine similarity + norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True) + norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True) + sim = np.sum(embeddings1 * embeddings2, axis=1) / ( + norm1.flatten() * norm2.flatten() + ) + return sim + + +class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol): + mteb_model_meta = _empty_model_meta + + def __init__(self, vllm_model): + self.llm = vllm_model + self.rng = np.random.default_rng(seed=42) + def predict( self, - sentences: list[tuple[str, str, str | None]], # query, corpus, prompt + inputs1: DataLoader[mteb.types.BatchedInput], + inputs2: DataLoader[mteb.types.BatchedInput], *args, **kwargs, ) -> np.ndarray: - r = self.rng.permutation(len(sentences)) - sentences = [sentences[i] for i in r] - - queries = [s[0] for s in sentences] - corpus = [s[1] for s in sentences] + queries = [text for batch in inputs1 for text in batch["text"]] + corpus = [text for batch in inputs2 for text in batch["text"]] outputs = self.llm.score( queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False ) scores = np.array(outputs) - scores = scores[np.argsort(r)] return scores -class OpenAIClientMtebEncoder(mteb.Encoder): +class OpenAIClientMtebEncoder(VllmMtebEncoder): def __init__(self, model_name: str, client): - super().__init__() self.model_name = model_name self.client = client self.rng = np.random.default_rng(seed=42) - def encode(self, sentences: Sequence[str], *args, **kwargs) -> np.ndarray: + def encode( + self, + inputs: DataLoader[mteb.types.BatchedInput], + *args, + **kwargs, + ) -> np.ndarray: # Hoping to discover potential scheduling # issues by randomizing the order. 
+ sentences = [text for batch in inputs for text in batch["text"]] r = self.rng.permutation(len(sentences)) sentences = [sentences[i] for i in r] @@ -94,28 +153,29 @@ def encode(self, sentences: Sequence[str], *args, **kwargs) -> np.ndarray: return embeds -class ScoreClientMtebEncoder(mteb.Encoder): +class ScoreClientMtebEncoder(mteb.CrossEncoderProtocol): + mteb_model_meta = _empty_model_meta + def __init__(self, model_name: str, url): - super().__init__() self.model_name = model_name self.url = url self.rng = np.random.default_rng(seed=42) def predict( self, - sentences: list[tuple[str, str, str | None]], # query, corpus, prompt + inputs1: DataLoader[mteb.types.BatchedInput], + inputs2: DataLoader[mteb.types.BatchedInput], *args, **kwargs, ) -> np.ndarray: - r = self.rng.permutation(len(sentences)) - sentences = [sentences[i] for i in r] + queries = [text for batch in inputs1 for text in batch["text"]] + full_corpus = [text for batch in inputs2 for text in batch["text"]] outputs = [] - for query, corpus, prompt in sentences: + for query, corpus in zip(queries, full_corpus): outputs.append(self.get_score(query, corpus)) scores = np.array(outputs) - scores = scores[np.argsort(r)] return scores def get_score(self, query, corpus): @@ -145,16 +205,13 @@ def get_score(self, query, corpus): return response["results"][0]["relevance_score"] -def run_mteb_embed_task(encoder, tasks): +def run_mteb_embed_task(encoder: mteb.EncoderProtocol, tasks): tasks = mteb.get_tasks(tasks=tasks) - evaluation = mteb.MTEB(tasks=tasks) - results = evaluation.run( + results = mteb.evaluate( encoder, - verbosity=0, - output_folder=None, - encode_kwargs={ - "show_progress_bar": False, - }, + tasks, + cache=None, + show_progress_bar=False, ) main_score = results[0].scores["test"][0]["main_score"] @@ -244,33 +301,39 @@ def mteb_test_embed_models( assert st_main_score - vllm_main_score < atol -def run_mteb_rerank(cross_encoder, tasks, languages): - with tempfile.TemporaryDirectory() as results_folder: +def run_mteb_rerank(cross_encoder: mteb.CrossEncoderProtocol, tasks, languages): + with tempfile.TemporaryDirectory() as prediction_folder: bm25s = mteb.get_model("bm25s") - tasks = mteb.get_tasks(tasks=tasks, languages=languages) - - subset = "default" eval_splits = ["test"] - evaluation = mteb.MTEB(tasks=tasks) - evaluation.run( + mteb_tasks: list[mteb.abstasks.AbsTaskRetrieval] = mteb.get_tasks( + tasks=tasks, languages=languages, eval_splits=eval_splits + ) + + mteb.evaluate( bm25s, - verbosity=0, - eval_splits=eval_splits, - save_predictions=True, - output_folder=f"{results_folder}/stage1", - encode_kwargs={"show_progress_bar": False}, + mteb_tasks, + prediction_folder=prediction_folder, + show_progress_bar=False, + # don't save results for test runs + cache=None, + overwrite_strategy="always", ) - results = evaluation.run( + second_stage_tasks = [] + for task in mteb_tasks: + second_stage_tasks.append( + task.convert_to_reranking( + prediction_folder, + top_k=10, + ) + ) + + results = mteb.evaluate( cross_encoder, - verbosity=0, - eval_splits=eval_splits, - top_k=10, - save_predictions=True, - output_folder=f"{results_folder}/stage2", - previous_results=f"{results_folder}/stage1/NFCorpus_{subset}_predictions.json", - encode_kwargs={"show_progress_bar": False}, + second_stage_tasks, + show_progress_bar=False, + cache=None, ) main_score = results[0].scores["test"][0]["main_score"] return main_score @@ -280,20 +343,6 @@ def mteb_test_rerank_models_hf( hf_runner, model_name, hf_dtype="float32", hf_model_callback=None ): 
with hf_runner(model_name, is_cross_encoder=True, dtype=hf_dtype) as hf_model: - original_predict = hf_model.predict - - def _predict( - sentences: list[tuple[str, str, str | None]], # query, corpus, prompt - *args, - **kwargs, - ): - # vllm and st both remove the prompt, fair comparison. - prompts = [(s[0], s[1]) for s in sentences] - return original_predict(prompts, *args, **kwargs, batch_size=8) - - hf_model.predict = _predict - hf_model.original_predict = original_predict - if hf_model_callback is not None: hf_model_callback(hf_model) @@ -310,7 +359,7 @@ def mteb_test_rerank_models( model_info: RerankModelInfo, vllm_extra_kwargs=None, hf_model_callback=None, - vllm_mteb_encoder=VllmMtebEncoder, + vllm_mteb_encoder=VllmMtebCrossEncoder, atol=MTEB_RERANK_TOL, ): vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs) diff --git a/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py b/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py index 2927a3711136..6b2e46964492 100644 --- a/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py +++ b/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py @@ -2,13 +2,15 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any +import mteb import numpy as np import pytest import torch +from torch.utils.data import DataLoader from tests.conftest import HfRunner from tests.models.language.pooling_mteb_test.mteb_utils import ( - VllmMtebEncoder, + VllmMtebCrossEncoder, mteb_test_rerank_models, ) from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo @@ -103,7 +105,7 @@ def get_inputs(pairs, tokenizer, prompt=None): return torch.Tensor(scores) -class GemmaMtebEncoder(VllmMtebEncoder): +class GemmaMtebEncoder(VllmMtebCrossEncoder): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.query_template = "A: {query}\n" @@ -111,17 +113,26 @@ def __init__(self, *args, **kwargs): def predict( self, - sentences: list[tuple[str, str, str | None]], # query, corpus, prompt + inputs1: DataLoader[mteb.types.BatchedInput], + inputs2: DataLoader[mteb.types.BatchedInput], *args, **kwargs, ) -> np.ndarray: - _sentences = [] - for query, corpus, prompt in sentences: - query = self.query_template.format(query=query) - corpus = self.document_template.format(doc=corpus, prompt=PROMPT) - _sentences.append((query, corpus, prompt)) - - return super().predict(_sentences, *args, **kwargs) + queries = [ + self.query_template.format(query=text) + for batch in inputs1 + for text in batch["text"] + ] + corpus = [ + self.document_template.format(doc=text, prompt=PROMPT) + for batch in inputs2 + for text in batch["text"] + ] + outputs = self.llm.score( + queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False + ) + scores = np.array(outputs) + return scores @pytest.mark.parametrize("model_info", RERANK_MODELS) diff --git a/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py b/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py index fd04dc199023..a6f2a89b268f 100644 --- a/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py +++ b/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py @@ -70,8 +70,9 @@ def compute_logits(inputs): return scores scores = [] - for prompt in prompts: - inputs = process_inputs([prompt]) + for query, doc, *_ in prompts: + pairs = [(query, doc)] + inputs = process_inputs(pairs) score = compute_logits(inputs) scores.append(score[0].item()) return torch.Tensor(scores) 
diff --git a/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py b/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py index 00e99f44cfdb..9a1be6c0be1d 100644 --- a/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py +++ b/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py @@ -72,8 +72,9 @@ def compute_logits(inputs): return scores scores = [] - for prompt in prompts: - inputs = process_inputs([prompt]) + for query, doc, *_ in prompts: + pairs = [(query, doc)] + inputs = process_inputs(pairs) score = compute_logits(inputs) scores.append(score[0].item()) return torch.Tensor(scores) From a4511e38db375a85b4dd784c2c38528747288f46 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 19 Nov 2025 01:46:32 -0500 Subject: [PATCH 178/578] Speed up macOS smoke test (#28954) Signed-off-by: Michael Goin Signed-off-by: mgoin --- .github/workflows/macos-smoke-test.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/macos-smoke-test.yml b/.github/workflows/macos-smoke-test.yml index 42b05ecd5ac0..a183033c9add 100644 --- a/.github/workflows/macos-smoke-test.yml +++ b/.github/workflows/macos-smoke-test.yml @@ -9,7 +9,7 @@ on: jobs: macos-m1-smoke-test: runs-on: macos-latest - timeout-minutes: 20 + timeout-minutes: 30 steps: - uses: actions/checkout@v4 @@ -37,15 +37,14 @@ jobs: - name: Verify installation run: | python -c "import vllm; print(f'vLLM version: {vllm.__version__}')" - python -c "import torch; print(f'PyTorch: {torch.__version__}')" - name: Smoke test vllm serve - timeout-minutes: 10 run: | # Start server in background vllm serve Qwen/Qwen3-0.6B \ - --max-model-len=2048 \ + --max-model-len=2K \ --load-format=dummy \ + --hf-overrides '{"num_hidden_layers": 2}' \ --enforce-eager \ --port 8000 & From 7ed27f3cb55e3f64614300ec7acde1b382a48541 Mon Sep 17 00:00:00 2001 From: Didier Durand <2927957+didier-durand@users.noreply.github.com> Date: Wed, 19 Nov 2025 07:52:30 +0100 Subject: [PATCH 179/578] [Doc]: fix typos in various files (#28945) Signed-off-by: Didier Durand --- docs/design/moe_kernel_features.md | 4 ++-- docs/design/plugin_system.md | 2 +- docs/features/quantization/quark.md | 2 +- examples/online_serving/prometheus_grafana/README.md | 2 +- vllm/engine/arg_utils.py | 2 +- vllm/envs.py | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/design/moe_kernel_features.md b/docs/design/moe_kernel_features.md index 7663b82266f0..36ae9506b65f 100644 --- a/docs/design/moe_kernel_features.md +++ b/docs/design/moe_kernel_features.md @@ -4,7 +4,7 @@ The purpose of this document is to provide an overview of the various MoE kernel ## Fused MoE Modular All2All backends -There are a number of all2all communication backends that are used to implement expert parallelism (EP) for the `FusedMoE` layer. The different `FusedMoEPrepareAndFinalize` sub-classes provide an interface for each all2all backend. +There are a number of all2all communication backends that are used to implement expert parallelism (EP) for the `FusedMoE` layer. The different `FusedMoEPrepareAndFinalize` subclasses provide an interface for each all2all backend. The following table describes the relevant features of each backend, i.e. activation format, supported quantization schemes and async support. @@ -68,7 +68,7 @@ Modular kernels are supported by the following `FusedMoEMethodBase` classes. ## Fused MoE Experts Kernels -The are a number of MoE experts kernel implementations for different quantization types and architectures. 
Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. Many have modular kernel adapters so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties. +There are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. Many have modular kernel adapters so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties. Each kernel must be provided with one of the supported input activation formats. Some flavors of kernels support both standard and batched formats through different entry points, e.g. `TritonExperts` and `BatchedTritonExperts`. Batched format kernels are currently only needed for matching with certain all2all backends, e.g. `pplx`, `DeepEPLLPrepareAndFinalize`. diff --git a/docs/design/plugin_system.md b/docs/design/plugin_system.md index dc2f7c4aed3c..e8db8047ca4e 100644 --- a/docs/design/plugin_system.md +++ b/docs/design/plugin_system.md @@ -49,7 +49,7 @@ Every plugin has three parts: - **Platform plugins** (with group name `vllm.platform_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree platforms into vLLM. The plugin function should return `None` when the platform is not supported in the current environment, or the platform class's fully qualified name when the platform is supported. -- **IO Processor plugins** (with group name `vllm.io_processor_plugins`): The primary use case for these plugins is to register custom pre/post processing of the model prompt and model output for pooling models. The plugin function returns the IOProcessor's class fully qualified name. +- **IO Processor plugins** (with group name `vllm.io_processor_plugins`): The primary use case for these plugins is to register custom pre-/post-processing of the model prompt and model output for pooling models. The plugin function returns the IOProcessor's class fully qualified name. - **Stat logger plugins** (with group name `vllm.stat_logger_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree loggers into vLLM. The entry point should be a class that subclasses StatLoggerBase. diff --git a/docs/features/quantization/quark.md b/docs/features/quantization/quark.md index bd7bc186e13a..c54d7d225199 100644 --- a/docs/features/quantization/quark.md +++ b/docs/features/quantization/quark.md @@ -306,7 +306,7 @@ As examples, we provide some ready-to-use quantized mixed precision model to sho ### 2. inference the quantized mixed precision model in vLLM -Models quantized with AMD Quark using mixed precision can natively be reload in vLLM, and e.g. evaluated using lm-evaluation-harness as follow: +Models quantized with AMD Quark using mixed precision can natively be reload in vLLM, and e.g. evaluated using lm-evaluation-harness as follows: ```bash lm_eval --model vllm \ diff --git a/examples/online_serving/prometheus_grafana/README.md b/examples/online_serving/prometheus_grafana/README.md index 5cd4dab5a8fa..9615210a2ad8 100644 --- a/examples/online_serving/prometheus_grafana/README.md +++ b/examples/online_serving/prometheus_grafana/README.md @@ -46,7 +46,7 @@ Navigate to [`http://localhost:3000`](http://localhost:3000). 
Log in with the de Navigate to [`http://localhost:3000/connections/datasources/new`](http://localhost:3000/connections/datasources/new) and select Prometheus. -On Prometheus configuration page, we need to add the `Prometheus Server URL` in `Connection`. For this setup, Grafana and Prometheus are running in separate containers, but Docker creates DNS name for each containers. You can just use `http://prometheus:9090`. +On Prometheus configuration page, we need to add the `Prometheus Server URL` in `Connection`. For this setup, Grafana and Prometheus are running in separate containers, but Docker creates DNS name for each container. You can just use `http://prometheus:9090`. Click `Save & Test`. You should get a green check saying "Successfully queried the Prometheus API.". diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ab6e5e594c23..e2f7326448b3 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1500,7 +1500,7 @@ def create_engine_config( # Local DP rank = 1, use pure-external LB. if data_parallel_external_lb: assert self.data_parallel_rank is not None, ( - "data_parallel_rank or node_rank must be spefified if " + "data_parallel_rank or node_rank must be specified if " "data_parallel_external_lb is enable." ) assert self.data_parallel_size_local in (1, None), ( diff --git a/vllm/envs.py b/vllm/envs.py index 6d92d5afee50..e61fb114325c 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -1261,7 +1261,7 @@ def get_vllm_port() -> int | None: # MoE routing strategy selector. # See `RoutingSimulator.get_available_strategies()` # for available # strategies. - # Cutstom routing strategies can be registered by + # Custom routing strategies can be registered by # RoutingSimulator.register_strategy() # Note: custom strategies may not produce correct model outputs "VLLM_MOE_ROUTING_SIMULATION_STRATEGY": lambda: os.environ.get( From ae4821a1086325decbc801d3292dee42e42549bb Mon Sep 17 00:00:00 2001 From: Louie Tsai Date: Tue, 18 Nov 2025 23:47:57 -0800 Subject: [PATCH 180/578] Add CPU support model (#28697) Signed-off-by: Tsai, Louie --- docs/models/hardware_supported_models/cpu.md | 26 ++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 docs/models/hardware_supported_models/cpu.md diff --git a/docs/models/hardware_supported_models/cpu.md b/docs/models/hardware_supported_models/cpu.md new file mode 100644 index 000000000000..0832755f8fbe --- /dev/null +++ b/docs/models/hardware_supported_models/cpu.md @@ -0,0 +1,26 @@ +# CPU - Intel® Xeon® + +## Supported Models + +### Text-only Language Models + +| Model | Architecture | Supported | +|--------------------------------------|-------------------------------------------|-----------| +| meta-llama/Llama-3.1 / 3.3 | LlamaForCausalLM | ✅ | +| meta-llama/Llama-4-Scout | Llama4ForConditionalGeneration | ✅ | +| meta-llama/Llama-4-Maverick | Llama4ForConditionalGeneration | ✅ | +| ibm-granite/granite (Granite-MOE) | GraniteMoeForCausalLM | ✅ | +| Qwen/Qwen3 | Qwen3ForCausalLM | ✅ | +| zai-org/GLM-4.5 | GLMForCausalLM | ✅ | +| google/gemma | GemmaForCausalLM | ✅ | + +### Multimodal Language Models + +| Model | Architecture | Supported | +|--------------------------------------|-------------------------------------------|-----------| +| Qwen/Qwen2.5-VL | Qwen2VLForConditionalGeneration | ✅ | +| openai/whisper | WhisperForConditionalGeneration | ✅ | + +✅ Runs and optimized. +🟨 Runs and correct but not optimized to green yet. +❌ Does not pass accuracy test or does not run. 
From d69062c67af46a2e624be92162e9db585eef329b Mon Sep 17 00:00:00 2001 From: gnovack Date: Wed, 19 Nov 2025 00:32:00 -0800 Subject: [PATCH 181/578] add support for --fully-sharded-loras in fused_moe (#28761) Signed-off-by: gnovack Co-authored-by: Jee Jee Li --- tests/lora/test_fused_moe_lora_kernel.py | 208 +++++++++++++++++- tests/lora/test_olmoe_tp.py | 10 +- vllm/lora/layers/fused_moe.py | 36 ++- vllm/lora/ops/triton_ops/fused_moe_lora_op.py | 24 +- vllm/lora/punica_wrapper/punica_base.py | 2 + vllm/lora/punica_wrapper/punica_gpu.py | 4 + 6 files changed, 274 insertions(+), 10 deletions(-) diff --git a/tests/lora/test_fused_moe_lora_kernel.py b/tests/lora/test_fused_moe_lora_kernel.py index 91ab4a87c65f..91c8b861c3c5 100644 --- a/tests/lora/test_fused_moe_lora_kernel.py +++ b/tests/lora/test_fused_moe_lora_kernel.py @@ -1,13 +1,25 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os import random import pytest import torch +from tests.utils import multi_gpu_test from vllm import _custom_ops as ops +from vllm.distributed import ( + init_distributed_environment, + initialize_model_parallel, + tensor_model_parallel_all_gather, + tensor_model_parallel_all_reduce, +) +from vllm.distributed.parallel_state import ( + get_tensor_model_parallel_world_size, +) from vllm.lora.ops.triton_ops import fused_moe_lora from vllm.platforms import current_platform +from vllm.utils.network_utils import get_open_port @pytest.fixture(autouse=True) @@ -122,6 +134,8 @@ def use_fused_moe_lora_kernel( max_loras, num_experts, block_size, + fully_sharded=False, + offset=0, ): max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1) max_num_tokens_padded = round_up(max_num_tokens_padded, block_size) @@ -195,10 +209,10 @@ def use_fused_moe_lora_kernel( config["NUM_STAGES"], config["SPLIT_K"], mul_routed_weight, + fully_sharded=fully_sharded, + offset=offset, ) - return output - def use_torch( hidden_states, @@ -317,3 +331,193 @@ def test_fused_moe_lora_kernel( ) torch.testing.assert_close(output, output2, atol=1e-1, rtol=1e-1) + + +@multi_gpu_test(num_gpus=2) +@pytest.mark.parametrize("num_tokens", [100]) +@pytest.mark.parametrize("top_k_num", [6]) +@pytest.mark.parametrize("num_experts", [64]) +@pytest.mark.parametrize("max_loras", [4]) +@pytest.mark.parametrize("N", [1408]) +@pytest.mark.parametrize("K", [2048]) +@pytest.mark.parametrize("max_lora_rank", [16, 32, 64]) +@pytest.mark.parametrize("block_size", [16]) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("column_parallel", [True, False]) +def test_fused_moe_lora_kernel_fully_sharded( + num_tokens, + top_k_num, + num_experts, + max_loras, + N, + K, + max_lora_rank, + block_size, + dtype, + seed, + column_parallel, +): + current_platform.seed_everything(seed) + # the number of randomly generated sentences. 
+ num_sequences = 10 + # generate data + topk_ids, topk_weights, token_lora_mapping = sample_data( + num_tokens, num_sequences, max_loras, num_experts, top_k_num + ) + + def run_torch_spawn(fn, nprocs): + torch.multiprocessing.spawn( + fn, + args=( + nprocs, + f"tcp://{os.getenv('LOCALHOST', 'localhost')}:{get_open_port()}", + dtype, + seed, + N, + K, + num_tokens, + topk_ids, + topk_weights, + token_lora_mapping, + max_lora_rank, + top_k_num, + max_loras, + num_experts, + block_size, + column_parallel, + ), + nprocs=nprocs, + ) + + run_torch_spawn(use_fused_moe_lora_kernel_tensor_parallel, nprocs=2) + + +def use_fused_moe_lora_kernel_tensor_parallel( + local_rank, + world_size, + init_method, + dtype, + seed, + N, + K, + num_tokens, + topk_ids, + topk_weights, + token_lora_mapping, + max_lora_rank, + top_k_num, + max_loras, + num_experts, + block_size, + column_parallel, +): + def _get_shard_slice(shard_size): + return slice(local_rank * shard_size, (local_rank + 1) * shard_size) + + current_platform.seed_everything(seed) + + device = torch.device(f"cuda:{local_rank}") + torch.cuda.set_device(device) + torch.set_default_device(device) + torch.set_default_dtype(dtype) + + init_distributed_environment( + world_size=world_size, + rank=local_rank, + local_rank=local_rank, + distributed_init_method=init_method, + ) + initialize_model_parallel(world_size, 1) + tp_size = get_tensor_model_parallel_world_size() + + input_dim = K if column_parallel else N + output_dim = N if column_parallel else K + + # init lora weights + lora_a = torch.rand( + ( + max_loras, + num_experts, + max_lora_rank, + input_dim, + ), + dtype=dtype, + ) + lora_b = torch.rand( + ( + max_loras, + num_experts, + output_dim, + max_lora_rank, + ), + dtype=dtype, + ) + + hidden_states = torch.rand( + ( + num_tokens, + input_dim, + ), + dtype=dtype, + ) + + output = torch.zeros((num_tokens, top_k_num, output_dim), dtype=dtype) + topk_ids = topk_ids.to(device) + topk_weights = topk_weights.to(device) + token_lora_mapping = token_lora_mapping.to(device) + + ref_output = use_torch( + hidden_states, + token_lora_mapping, + topk_ids, + [lora_a], + [lora_b], + top_k_num, + ) + + if column_parallel: + # Column parallel (e.g. gate_up_proj): LoRA A is sliced along the rank dim, + # and Lora B is sliced along the output dim + lora_a_shard_size = max_lora_rank // tp_size + lora_a = lora_a[:, :, _get_shard_slice(lora_a_shard_size), :] + max_lora_rank = lora_a_shard_size + offset = 0 + + lora_b_shard_size = output_dim // tp_size + lora_b = lora_b[:, :, _get_shard_slice(lora_b_shard_size), :] + output = output[:, :, _get_shard_slice(lora_b_shard_size)].contiguous() + else: + # Row parallel (e.g. 
down proj): LoRA A is sliced along the input dim, + # and LoRA B is sliced along the output dim + lora_a_shard_size = input_dim // tp_size + lora_a = lora_a[:, :, :, _get_shard_slice(lora_a_shard_size)] + hidden_states = hidden_states[:, _get_shard_slice(lora_a_shard_size)] + + lora_b_shard_size = output_dim // tp_size + lora_b = lora_b[:, :, _get_shard_slice(lora_b_shard_size), :] + offset = lora_b_shard_size * local_rank + + use_fused_moe_lora_kernel( + topk_ids, + topk_weights, + token_lora_mapping, + max_lora_rank, + top_k_num, + [lora_a], + [lora_b], + hidden_states, + output, + max_loras, + num_experts, + block_size, + fully_sharded=True, + offset=offset, + ) + + if column_parallel: + output = tensor_model_parallel_all_gather(output) + else: + output = tensor_model_parallel_all_reduce(output) + + torch.testing.assert_close(output, ref_output, atol=1e-1, rtol=1e-1) diff --git a/tests/lora/test_olmoe_tp.py b/tests/lora/test_olmoe_tp.py index e659c1e1a9a0..e3c9816625ba 100644 --- a/tests/lora/test_olmoe_tp.py +++ b/tests/lora/test_olmoe_tp.py @@ -2,6 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest + import vllm from vllm.lora.request import LoRARequest @@ -111,8 +113,9 @@ def test_olmoe_lora_mixed(olmoe_lora_files): generate_and_test(llm, olmoe_lora_files, lora_id=[1, None, 3, None]) +@pytest.mark.parametrize("fully_sharded_loras", [False, True]) @multi_gpu_test(num_gpus=2) -def test_olmoe_lora_tp2(olmoe_lora_files): +def test_olmoe_lora_tp2(olmoe_lora_files, fully_sharded_loras): llm = vllm.LLM( MODEL_PATH, max_model_len=1024, @@ -122,14 +125,16 @@ def test_olmoe_lora_tp2(olmoe_lora_files): trust_remote_code=True, enable_chunked_prefill=True, tensor_parallel_size=2, + fully_sharded_loras=fully_sharded_loras, ) generate_and_test(llm, olmoe_lora_files, lora_id=1) generate_and_test(llm, olmoe_lora_files, lora_id=2) +@pytest.mark.parametrize("fully_sharded_loras", [False, True]) @multi_gpu_test(num_gpus=4) -def test_olmoe_lora_tp4(olmoe_lora_files): +def test_olmoe_lora_tp4(olmoe_lora_files, fully_sharded_loras): llm = vllm.LLM( MODEL_PATH, max_model_len=1024, @@ -139,6 +144,7 @@ def test_olmoe_lora_tp4(olmoe_lora_files): trust_remote_code=True, enable_chunked_prefill=True, tensor_parallel_size=4, + fully_sharded_loras=fully_sharded_loras, ) generate_and_test(llm, olmoe_lora_files, lora_id=1) diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index 8fb3efa220f6..3291c41fcda1 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -12,6 +12,7 @@ get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, ) +from vllm.distributed.utils import divide from vllm.lora.layers.base import BaseLayerWithLoRA from vllm.lora.ops.triton_ops.utils import get_lora_op_configs from vllm.model_executor.layers.fused_moe import FusedMoE @@ -205,6 +206,7 @@ def wrapper(*args, **kwargs): shrink_config, ## pass the shrink config expand_config, ## pass the expand config self.adapter_enabled, + fully_sharded=self.fully_sharded, ) result = func(*args, **kwargs) @@ -250,7 +252,10 @@ def wrapper(*args, **kwargs): sorted_token_ids_lora = sorted_token_ids_lora.view(max_loras, -1) intermediate_cache2 = moe_state_dict["intermediate_cache2"] intermediate_cache3 = args[0] - max_lora_rank = self.w1_lora_a_stacked.shape[-2] + max_lora_rank = self.w2_lora_a_stacked.shape[-2] + + shard_size_w2 = divide(self.base_layer.hidden_size, self.tp_size) + self.punica_wrapper.add_lora_fused_moe( intermediate_cache3, 
intermediate_cache2, @@ -266,6 +271,8 @@ def wrapper(*args, **kwargs): expand_config, ## pass the expand config self.adapter_enabled, True, + fully_sharded=self.fully_sharded, + offset=shard_size_w2 * self.tp_rank if self.fully_sharded else 0, ) result = func(*args, **kwargs) @@ -294,6 +301,7 @@ def create_lora_weights( model_config: PretrainedConfig | None = None, ) -> None: """Initializes lora matrices.""" + self.fully_sharded = lora_config.fully_sharded_loras self.adapter_enabled = torch.tensor( [0] * (max_loras + 1), dtype=torch.int, device=self.device @@ -303,7 +311,9 @@ def create_lora_weights( ( max_loras, self.base_layer.local_num_experts, - lora_config.max_lora_rank, + lora_config.max_lora_rank + if not self.fully_sharded + else divide(lora_config.max_lora_rank, self.tp_size), self.base_layer.hidden_size, ), dtype=lora_config.lora_dtype, @@ -334,7 +344,9 @@ def create_lora_weights( ( max_loras, self.base_layer.local_num_experts, - self.base_layer.hidden_size, + self.base_layer.hidden_size + if not self.fully_sharded + else divide(self.base_layer.hidden_size, self.tp_size), lora_config.max_lora_rank, ), dtype=lora_config.lora_dtype, @@ -345,7 +357,9 @@ def create_lora_weights( ( max_loras, self.base_layer.local_num_experts, - lora_config.max_lora_rank, + lora_config.max_lora_rank + if not self.fully_sharded + else divide(lora_config.max_lora_rank, self.tp_size), self.base_layer.hidden_size, ), dtype=lora_config.lora_dtype, @@ -419,6 +433,20 @@ def set_lora( w3_lora_b = w3_lora_b[start_idx:end_idx, :] w2_lora_a = w2_lora_a[:, start_idx:end_idx] + if self.fully_sharded: + # Based on S-LoRA, we slice W1 and W3 A along the rank dim, + # and W2 B along the hidden_size dim. + w13_shard_size = self.w1_lora_a_stacked[index, eid].shape[0] + w13_start_idx = self.tp_rank * w13_shard_size + w13_end_idx = (self.tp_rank + 1) * w13_shard_size + w1_lora_a = w1_lora_a[w13_start_idx:w13_end_idx, :] + w3_lora_a = w3_lora_a[w13_start_idx:w13_end_idx, :] + + w2_shard_size = self.w2_lora_b_stacked[index, eid].shape[0] + w2_start_idx = self.tp_rank * w2_shard_size + w2_end_idx = (self.tp_rank + 1) * w2_shard_size + w2_lora_b = w2_lora_b[w2_start_idx:w2_end_idx, :] + self.w1_lora_a_stacked[ index, eid, : w1_lora_a.shape[0], : w1_lora_a.shape[1] ].copy_(w1_lora_a, non_blocking=True) diff --git a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py index e2dd47dbb4e6..413ee8ecbbf9 100644 --- a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py +++ b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py @@ -3,6 +3,10 @@ import torch +from vllm.distributed import ( + tensor_model_parallel_all_gather, + tensor_model_parallel_all_reduce, +) from vllm.triton_utils import tl, triton from vllm.utils.torch_utils import direct_register_custom_op @@ -311,6 +315,7 @@ def _fused_moe_lora_expand( num_stages: int, split_k: int, mul_routed_weight: bool = False, + offset: int = 0, ) -> None: b_ptr = _get_ptr(lora_b_stacked, device) K = max_lora_rank @@ -380,7 +385,7 @@ def _fused_moe_lora_expand( **expand_config, ) for i in range(num_slices): - output[:, :, i * N : (i + 1) * N] += b_intermediate_cache1[i] + output[:, :, i * N + offset : (i + 1) * N + offset] += b_intermediate_cache1[i] @torch.inference_mode() @@ -416,6 +421,8 @@ def _fused_moe_lora( expand_num_stages: int, expand_split_k: int, mul_routed_weight: bool = False, + fully_sharded: bool = False, + offset: int = 0, ) -> None: assert len(lora_a_stacked) == len(lora_b_stacked) > 0 assert ( @@ -430,7 +437,6 @@ def _fused_moe_lora( == 
expert_ids.shape[0] == num_tokens_post_padded.shape[0] ) - assert len(lora_b_stacked) * lora_b_stacked[0].shape[-2] == output.shape[-1] assert output.shape[0] == topk_weights.shape[0] assert top_k_num == topk_weights.shape[1] device = qcurr_hidden_states.device @@ -480,6 +486,19 @@ def _fused_moe_lora( mul_routed_weight, ) + if fully_sharded: + if max_lora_rank == w1_lora_b_stacked.shape[-1]: + a_intermediate_cache1 = tensor_model_parallel_all_reduce( + a_intermediate_cache1 + ) + else: + a_intermediate_cache1 = tensor_model_parallel_all_gather( + a_intermediate_cache1 + ) + + # reset max_lora_rank to the full rank after allgather + max_lora_rank = a_intermediate_cache1.shape[-1] + _fused_moe_lora_expand( output, a_intermediate_cache1, @@ -510,6 +529,7 @@ def _fused_moe_lora( expand_num_stages, expand_split_k, mul_routed_weight, + offset, ) diff --git a/vllm/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py index b6186e856152..a6ffbb7b71ce 100644 --- a/vllm/lora/punica_wrapper/punica_base.py +++ b/vllm/lora/punica_wrapper/punica_base.py @@ -483,6 +483,8 @@ def add_lora_fused_moe( expand_config, adapter_enabled: torch.Tensor, mul_routed_weight=False, + fully_sharded: bool = False, + offset: int = 0, ): """ Performs a fused forward computation for LoRA of diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py index ede50a48af98..d863a5884d3c 100644 --- a/vllm/lora/punica_wrapper/punica_gpu.py +++ b/vllm/lora/punica_wrapper/punica_gpu.py @@ -375,6 +375,8 @@ def add_lora_fused_moe( expand_config, adapter_enabled: torch.Tensor, mul_routed_weight=False, + fully_sharded: bool = False, + offset: int = 0, ): """ Performs a fused forward computation for LoRA of Mixture-of-Experts (MoE) layer. @@ -408,4 +410,6 @@ def add_lora_fused_moe( expand_config.get("NUM_STAGES", 3), expand_config.get("SPLIT_K", 1), mul_routed_weight, + fully_sharded, + offset, ) From fdf93486d6c4f36be2f410a846bf68654041dc51 Mon Sep 17 00:00:00 2001 From: Michael Yao Date: Wed, 19 Nov 2025 18:35:29 +0800 Subject: [PATCH 182/578] [Docs] Clean up moe_kernel_features.md (#28530) Signed-off-by: windsonsea --- docs/design/moe_kernel_features.md | 92 +++++++++++++++--------------- 1 file changed, 45 insertions(+), 47 deletions(-) diff --git a/docs/design/moe_kernel_features.md b/docs/design/moe_kernel_features.md index 36ae9506b65f..f0d5a3e934f3 100644 --- a/docs/design/moe_kernel_features.md +++ b/docs/design/moe_kernel_features.md @@ -1,4 +1,4 @@ -# Fused MoE Kernel features +# Fused MoE Kernel Features The purpose of this document is to provide an overview of the various MoE kernels (both modular and non-modular) so it will be easier to select an appropriate set of kernels for any particular situation. This includes information about the all2all backends used by modular kernels. @@ -8,15 +8,15 @@ There are a number of all2all communication backends that are used to implement The following table describes the relevant features of each backend, i.e. activation format, supported quantization schemes and async support. -The output activation format (standard or batched) corresponds to the output of the prepare step of the `FusedMoEPrepareAndFinalize` subclass, the finalize step requires the same format. All the backend `prepare` methods expect activations in standard format and all the `finalize methods return activations in standard format. More details on the formats can be found in the [Fused MoE Modular Kernel](./fused_moe_modular_kernel.md) document. 
+The output activation format (standard or batched) corresponds to the output of the prepare step of the `FusedMoEPrepareAndFinalize` subclass, and the finalize step requires the same format. All the backend `prepare` methods expect activations in the standard format and all the `finalize` methods return activations in standard format. More details on the formats can be found in the [Fused MoE Modular Kernel](./fused_moe_modular_kernel.md) document. -The quantization types and formats enumerate which quantization schemes are supported by each `FusedMoEPrepareAndFinalize` class. The quantization can happen before or after the dispatch based on the format the all2all backend supports. e.g. deepep_high_throughput supports only block-quantized fp8 format, any other format will result in dispatching in higher precision and quantizing afterwards. The output of the prepare step for each backend is the quantized type. The finalize step generally requires the same input type as the original activations, e.g. if the original input is bfloat16 and the quantization scheme is fp8 w/per-tensor scales, `prepare` will return fp8/per-tensor scale activations and `finalize` will take bfloat16 activations. See the diagrams in [Fused MoE Modular Kernel](./fused_moe_modular_kernel.md) for more details on the types and formats of activations at each step of the MoE process. If no quantization type is specified, the kernel operates on float16 and/or bfloat16. +The quantization types and formats enumerate which quantization schemes are supported by each `FusedMoEPrepareAndFinalize` class. The quantization can happen before or after the dispatch based on the format the all2all backend supports, e.g. deepep_high_throughput supports only block-quantized fp8 format. Any other format will result in dispatching in higher precision and quantizing afterwards. The output of the prepare step for each backend is the quantized type. The finalize step generally requires the same input type as the original activations, e.g. if the original input is bfloat16 and the quantization scheme is fp8 with per-tensor scales, `prepare` will return fp8/per-tensor scale activations and `finalize` will take bfloat16 activations. See the diagrams in [Fused MoE Modular Kernel](./fused_moe_modular_kernel.md) for more details on the types and formats of activations at each step of the MoE process. If no quantization type is specified, the kernel operates on float16 and/or bfloat16. Async backends support the use of DBO (Dual Batch Overlap) and shared expert overlap (where shared experts are computed during the combine step). -Certain models require the topk weights to be applied to the input activations rather than the output activations when topk==1, e.g. llama. For modular kernels, this feature is supported by the `FusedMoEPrepareAndFinalize` subclass, for non-modular kernels, it is up to the experts function to deal with this flag. +Certain models require the topk weights to be applied to the input activations rather than the output activations when topk==1, e.g. Llama. For modular kernels, this feature is supported by the `FusedMoEPrepareAndFinalize` subclass. For non-modular kernels, it is up to the experts function to deal with this flag. -unless otherwise specified, backends are controlled via `VLLM_ALL2ALL_BACKEND`. All backends except `flashinfer` only work with EP+DP or EP+TP. `Flashinfer` can work with EP or DP w/o EP. +Unless otherwise specified, backends are controlled via `VLLM_ALL2ALL_BACKEND`. 
All backends except `flashinfer` only work with EP+DP or EP+TP. `Flashinfer` can work with EP or DP without EP. -| Backend | Output act. format | Quant. types | Quant. format | Async | Apply Weight On Input | Sub-class | -|---------------------------------------|--------------------|-----------------|------------------------|-------|-----------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------| -| naive | standard | all1 | G,A,T | N | 6 | [layer.py][vllm.model_executor.layers.fused_moe.layer.FusedMoE.forward_impl] | -| pplx | batched | fp8,int8 | G,A,T | Y | Y | [`PplxPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.pplx_prepare_finalize.PplxPrepareAndFinalize] | -| deepep_high_throughput | standard | fp8 | G(128),A,T2 | Y | Y | [`DeepEPLLPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize.DeepEPLLPrepareAndFinalize] | -| deepep_low_latency | batched | fp8 | G(128),A,T3 | Y | Y | [`DeepEPHTPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize.DeepEPHTPrepareAndFinalize] | -| flashinfer_all2allv | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferAllToAllMoEPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize.FlashInferAllToAllMoEPrepareAndFinalize] | -| flashinfer4 | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferCutlassMoEPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize.FlashInferCutlassMoEPrepareAndFinalize] | -| flashinfer4 | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferCutlassMoEPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize.FlashInferCutlassMoEPrepareAndFinalize] | -| MoEPrepareAndFinalizeNoEP5 | standard | fp8,int8 | G,A,T | N | Y | [`MoEPrepareAndFinalizeNoEP`][vllm.model_executor.layers.fused_moe.prepare_finalize.MoEPrepareAndFinalizeNoEP] | -| BatchedPrepareAndFinalize5 | batched | fp8,int8 | G,A,T | N | Y | [`BatchedPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedPrepareAndFinalize] | +| Backend | Output act. format | Quant. types | Quant. 
format | Async | Apply Weight On Input | Subclass | +|---------|--------------------|--------------|---------------|-------|-----------------------|-----------| +| naive | standard | all1 | G,A,T | N | 6 | [layer.py][vllm.model_executor.layers.fused_moe.layer.FusedMoE.forward_impl] | +| pplx | batched | fp8,int8 | G,A,T | Y | Y | [`PplxPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.pplx_prepare_finalize.PplxPrepareAndFinalize] | +| deepep_high_throughput | standard | fp8 | G(128),A,T2 | Y | Y | [`DeepEPLLPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize.DeepEPLLPrepareAndFinalize] | +| deepep_low_latency | batched | fp8 | G(128),A,T3 | Y | Y | [`DeepEPHTPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize.DeepEPHTPrepareAndFinalize] | +| flashinfer_all2allv | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferAllToAllMoEPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize.FlashInferAllToAllMoEPrepareAndFinalize] | +| flashinfer4 | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferCutlassMoEPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize.FlashInferCutlassMoEPrepareAndFinalize] | +| MoEPrepareAndFinalizeNoEP5 | standard | fp8,int8 | G,A,T | N | Y | [`MoEPrepareAndFinalizeNoEP`][vllm.model_executor.layers.fused_moe.prepare_finalize.MoEPrepareAndFinalizeNoEP] | +| BatchedPrepareAndFinalize5 | batched | fp8,int8 | G,A,T | N | Y | [`BatchedPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedPrepareAndFinalize] | !!! info "Table key" 1. All types: mxfp4, nvfp4, int4, int8, fp8 2. A,T quantization occurs after dispatch. 3. All quantization happens after dispatch. 4. Controlled by different env vars (`VLLM_FLASHINFER_MOE_BACKEND` "throughput" or "latency") - 5. This is a no-op dispatcher that can be used to pair with any modular experts to produce a modular kernel that runs w/o dispatch or combine. These cannot be selected via environment variable. These are generally use for testing or adapting an expert subclass to the `fused_experts` API. + 5. This is a no-op dispatcher that can be used to pair with any modular experts to produce a modular kernel that runs without dispatch or combine. These cannot be selected via environment variable. These are generally use for testing or adapting an expert subclass to the `fused_experts` API. 6. This depends on the experts implementation. --- @@ -66,44 +65,43 @@ Modular kernels are supported by the following `FusedMoEMethodBase` classes. - [`Mxfp4MoEMethod`][vllm.model_executor.layers.quantization.mxfp4.Mxfp4MoEMethod] - [`UnquantizedFusedMoEMethod`][vllm.model_executor.layers.fused_moe.layer.UnquantizedFusedMoEMethod] -## Fused MoE Experts Kernels +## Fused Experts Kernels -There are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. Many have modular kernel adapters so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties. +There are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. 
Many have modular kernel adapters, so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties. -Each kernel must be provided with one of the supported input activation formats. Some flavors of kernels support both standard and batched formats through different entry points, e.g. `TritonExperts` and `BatchedTritonExperts`. Batched format kernels are currently only needed for matching with certain all2all backends, e.g. `pplx`, `DeepEPLLPrepareAndFinalize`. +Each kernel must be provided with one of the supported input activation formats. Some flavors of kernels support both standard and batched formats through different entry points, e.g. `TritonExperts` and `BatchedTritonExperts`. Batched format kernels are currently only needed for matching with certain all2all backends, e.g. `pplx` and `DeepEPLLPrepareAndFinalize`. Similar to the backend kernels, each experts kernel only supports certain quantization formats. For non-modular experts, the activations will be in the original type and quantized internally by the kernel. Modular experts will expect the activations to already be in the quantized format. Both types of experts will yield outputs in the original activation type. -Each experts kernel supports one or more activation functions, e.g. silu, gelu that are applied to the intermediate results. +Each experts kernel supports one or more activation functions, e.g. silu or gelu, which are applied to the intermediate results. As with the backends, some experts support applying topk weights on the input activations. The entries in the column in this table only apply to the non-modular experts. Most experts flavors include an equivalent modular interface which will be a subclass of `FusedMoEPermuteExpertsUnpermute`. -To be used with a particular `FusedMoEPrepareAndFinalize` sub-class, MoE kernels must have compatible activation formats, quantization types and quantization formats. - -| Kernel | Input act. format | Quant. types | Quant. format | Activation function | Apply Weight On Input | Modular | Source | -|------------------------------|-----------------------|------------------|---------------|-------------------------------------------------------------|-----------------------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| triton | standard | all1 | G,A,T | silu, gelu,
swigluoai,
silu_no_mul,
gelu_no_mul | Y | Y | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],
[`TritonExperts`][vllm.model_executor.layers.fused_moe.fused_moe.TritonExperts] | -| triton (batched) | batched | all1 | G,A,T | silu, gelu | 6 | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts] | -| deep gemm | standard,
batched | fp8 | G(128),A,T | silu, gelu | 6 | Y | [`deep_gemm_moe_fp8`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.deep_gemm_moe_fp8],
[`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.DeepGemmExperts],
[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe.BatchedDeepGemmExperts] | -| cutlass_fp4 | standard,
batched | nvfp4 | A,T | silu | Y | Y | [`cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp4],
[`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp4] | -| cutlass_fp8 | standard,
batched | fp8 | A,T | silu, gelu | Y | Y | [`cutlass_moe_fp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp8],
[`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],
[`CutlasBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8] | -| flashinfer | standard | nvfp4,
fp8 | T | 5 | N | Y | [`flashinfer_cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.flashinfer_cutlass_moe_fp4],
[`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] | -| gpt oss triton | standard | N/A | N/A | 5 | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],
[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] | -| deep gemm+triton2 | standard,
batched | all1 | G(128),A,T | silu, gelu | 6 | Y | [`TritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe.TritonOrDeepGemmExperts],
[`BatchedTritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe.BatchedTritonOrDeepGemmExperts] | -| marlin | standard | 3 | 3 | silu,
swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],
[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],
[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] | -| marlin experts | standard,
batched | N/A | N/A | silu,
swigluoai | Y | Y | [`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],
[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] | -| trtllm | standard | mxfp4,
nvfp4 | G(16),G(32) | 5 | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] | -| pallas | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_pallas.fused_moe] | -| iterative | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_torch_iterative.fused_moe] | -| rocm aiter moe | standard | fp8 | G(128),A,T | silu, gelu | Y | N | [`rocm_aiter_fused_experts`][vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe.rocm_aiter_fused_experts] | -| cpu_fused_moe | standard | N/A | N/A | silu | N | N | [`CPUFusedMOE`][vllm.model_executor.layers.fused_moe.cpu_fused_moe.CPUFusedMOE] | -| naive batched4 | batched | int8,
fp8 | G,A,T | silu, gelu | 6 | Y | [`NaiveBatchedExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.NaiveBatchedExperts] | +To be used with a particular `FusedMoEPrepareAndFinalize` subclass, MoE kernels must have compatible activation formats, quantization types and quantization formats. + +| Kernel | Input act. format | Quant. types | Quant. format | Activation function | Apply Weight On Input | Modular | Source | +|--------|-------------------|--------------|---------------|---------------------|-----------------------|---------|--------| +| triton | standard | all1 | G,A,T | silu, gelu,
swigluoai,
silu_no_mul,
gelu_no_mul | Y | Y | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],
[`TritonExperts`][vllm.model_executor.layers.fused_moe.fused_moe.TritonExperts] | +| triton (batched) | batched | all1 | G,A,T | silu, gelu | 6 | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts] | +| deep gemm | standard,
batched | fp8 | G(128),A,T | silu, gelu | 6 | Y | [`deep_gemm_moe_fp8`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.deep_gemm_moe_fp8],
[`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.DeepGemmExperts],
[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe.BatchedDeepGemmExperts] | +| cutlass_fp4 | standard,
batched | nvfp4 | A,T | silu | Y | Y | [`cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp4],
[`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp4] | +| cutlass_fp8 | standard,
batched | fp8 | A,T | silu, gelu | Y | Y | [`cutlass_moe_fp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp8],
[`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],
[`CutlassBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8] |
+| flashinfer | standard | nvfp4,
fp8 | T | 5 | N | Y | [`flashinfer_cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.flashinfer_cutlass_moe_fp4],
[`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] | +| gpt oss triton | standard | N/A | N/A | 5 | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],
[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] | +| deep gemm+triton2 | standard,
batched | all1 | G(128),A,T | silu, gelu | 6 | Y | [`TritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe.TritonOrDeepGemmExperts],
[`BatchedTritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe.BatchedTritonOrDeepGemmExperts] | +| marlin | standard,
batched | 3 / N/A | 3 / N/A | silu,
swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],
[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],
[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] | +| trtllm | standard | mxfp4,
nvfp4 | G(16),G(32) | 5 | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] | +| pallas | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_pallas.fused_moe] | +| iterative | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_torch_iterative.fused_moe] | +| rocm aiter moe | standard | fp8 | G(128),A,T | silu, gelu | Y | N | [`rocm_aiter_fused_experts`][vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe.rocm_aiter_fused_experts] | +| cpu_fused_moe | standard | N/A | N/A | silu | N | N | [`CPUFusedMOE`][vllm.model_executor.layers.fused_moe.cpu_fused_moe.CPUFusedMOE] | +| naive batched4 | batched | int8,
fp8 | G,A,T | silu, gelu | 6 | Y | [`NaiveBatchedExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.NaiveBatchedExperts] | !!! info "Table key" 1. All types: mxfp4, nvfp4, int4, int8, fp8 - 2. A dispatcher wrapper around triton and deep gemm experts. Will select based on type + shape + quantization params + 2. A dispatcher wrapper around triton and deep gemm experts. Will select based on type + shape + quantization params 3. uint4, uint8, fp8, fp4 4. This is a naive implementation of experts that supports batched format. Mainly used for testing. 5. The `activation` parameter is ignored and SwiGlu is used by default instead. @@ -113,8 +111,8 @@ To be used with a particular `FusedMoEPrepareAndFinalize` sub-class, MoE kernels The following table shows "families" of modular kernels that are intended to work together. There are some combinations which may work but have not yet been tested, e.g. flashinfer with other fp8 experts. Note that the "naive" backend will work with any non-modular experts. -| backend | `FusedMoEPrepareAndFinalize` subclasses | `FusedMoEPermuteExpertsUnpermute` subclasses | -|----------------------------------|------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------| -| deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,
`TritonExperts`,
`TritonOrDeepGemmExperts`,
`CutlassExpertsFp8`,
`MarlinExperts` | -| deepep_low_latency,
pplx | `DeepEPLLPrepareAndFinalize`,
`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,
`BatchedTritonExperts`,
`BatchedTritonOrDeepGemmExperts`,
`CutlassBatchedExpertsFp8`,
`BatchedMarlinExperts`| -| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` | +| backend | `FusedMoEPrepareAndFinalize` subclasses | `FusedMoEPermuteExpertsUnpermute` subclasses | +|---------|-----------------------------------------|----------------------------------------------| +| deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,
`TritonExperts`,
`TritonOrDeepGemmExperts`,
`CutlassExpertsFp8`,
`MarlinExperts` | +| deepep_low_latency,
pplx | `DeepEPLLPrepareAndFinalize`,
`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,
`BatchedTritonExperts`,
`BatchedTritonOrDeepGemmExperts`,
`CutlassBatchedExpertsFp8`,
`BatchedMarlinExperts` | +| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` | From 815160958327d601933139b9e76a01eb6d2bc5cf Mon Sep 17 00:00:00 2001 From: ihb2032 <40718643+ihb2032@users.noreply.github.com> Date: Wed, 19 Nov 2025 19:05:44 +0800 Subject: [PATCH 183/578] refactor(cpu_types_scalar.hpp): Unify scalar loop implementations using unroll_loop (#28847) Signed-off-by: ihb2032 <1355790728@qq.com> Co-authored-by: lyd1992 --- csrc/cpu/cpu_types_scalar.hpp | 222 +++++++++++++--------------------- 1 file changed, 87 insertions(+), 135 deletions(-) diff --git a/csrc/cpu/cpu_types_scalar.hpp b/csrc/cpu/cpu_types_scalar.hpp index 1a9278bc662e..f9da78283da5 100644 --- a/csrc/cpu/cpu_types_scalar.hpp +++ b/csrc/cpu/cpu_types_scalar.hpp @@ -26,10 +26,6 @@ namespace vec_op { #define FORCE_INLINE __attribute__((always_inline)) inline -#define __max(a, b) ((a) > (b) ? (a) : (b)) -#define __min(a, b) ((a) < (b) ? (a) : (b)) -#define __abs(a) ((a) < (0) ? (0 - a) : (a)) - typedef struct f16x8_t { uint16_t val[8]; } f16x8_t; @@ -99,7 +95,7 @@ struct FP16Vec16 : public Vec { void save(void* ptr) const { *reinterpret_cast(ptr) = reg; } void save(void* ptr, const int elem_num) const { - int num = __min(elem_num, VEC_ELEM_NUM); + int num = std::min(elem_num, VEC_ELEM_NUM); std::memcpy(ptr, &(reg.val[0]), num * sizeof(uint16_t)); } }; @@ -128,7 +124,7 @@ struct BF16Vec16 : public Vec { void save(void* ptr) const { *reinterpret_cast(ptr) = reg; } void save(void* ptr, const int elem_num) const { - int num = __min(elem_num, VEC_ELEM_NUM); + int num = std::min(elem_num, VEC_ELEM_NUM); std::memcpy(ptr, &(reg.val[0]), num * sizeof(uint16_t)); } }; @@ -143,9 +139,9 @@ struct BF16Vec32 : public Vec { explicit BF16Vec32(f16x32_t data) : reg(data) {}; explicit BF16Vec32(BF16Vec8& vec8_data) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { + unroll_loop([&vec8_data, this](int i) { reg.val[i] = vec8_data.reg.val[i % BF16Vec8::VEC_ELEM_NUM]; - } + }); } void save(void* ptr) const { *reinterpret_cast(ptr) = reg; } @@ -157,15 +153,11 @@ struct FP32Vec4 : public Vec { f32x4_t reg; explicit FP32Vec4(float v) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = v; - } + unroll_loop([&v, this](int i) { reg.val[i] = v; }); } explicit FP32Vec4() { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = 0.0f; - } + unroll_loop([this](int i) { reg.val[i] = 0.0f; }); } explicit FP32Vec4(const float* ptr) @@ -182,15 +174,11 @@ struct FP32Vec8 : public Vec { f32x8_t reg; explicit FP32Vec8(float v) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = v; - } + unroll_loop([&v, this](int i) { reg.val[i] = v; }); } explicit FP32Vec8() { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = 0.0f; - } + unroll_loop([this](int i) { reg.val[i] = 0.0f; }); } explicit FP32Vec8(const float* ptr) @@ -201,78 +189,68 @@ struct FP32Vec8 : public Vec { explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {}; explicit FP32Vec8(const FP16Vec8& v) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = fp16_to_float(v.reg.val[i]); - } + unroll_loop( + [&v, this](int i) { reg.val[i] = fp16_to_float(v.reg.val[i]); }); } FP32Vec8(const BF16Vec8& v) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = bf16_to_float(v.reg.val[i]); - } + unroll_loop( + [&v, this](int i) { reg.val[i] = bf16_to_float(v.reg.val[i]); }); } float reduce_sum() const { float result = 0; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result += reg.val[i]; - } + unroll_loop( + [&result, this](int i) { result += reg.val[i]; 
}); return result; } FP32Vec8 exp() const { f32x8_t ret; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - ret.val[i] = expf(reg.val[i]); - } + unroll_loop( + [&ret, this](int i) { ret.val[i] = expf(reg.val[i]); }); return FP32Vec8(ret); } FP32Vec8 tanh() const { f32x8_t ret; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - ret.val[i] = tanhf(reg.val[i]); - } + unroll_loop( + [&ret, this](int i) { ret.val[i] = tanhf(reg.val[i]); }); return FP32Vec8(ret); } FP32Vec8 er() const { f32x8_t ret; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - ret.val[i] = erf(reg.val[i]); - } + unroll_loop( + [&ret, this](int i) { ret.val[i] = erf(reg.val[i]); }); return FP32Vec8(ret); } FP32Vec8 operator*(const FP32Vec8& b) const { f32x8_t ret; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - ret.val[i] = reg.val[i] * b.reg.val[i]; - } + unroll_loop( + [&ret, &b, this](int i) { ret.val[i] = reg.val[i] * b.reg.val[i]; }); return FP32Vec8(ret); } FP32Vec8 operator+(const FP32Vec8& b) const { f32x8_t ret; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - ret.val[i] = reg.val[i] + b.reg.val[i]; - } + unroll_loop( + [&ret, &b, this](int i) { ret.val[i] = reg.val[i] + b.reg.val[i]; }); return FP32Vec8(ret); } FP32Vec8 operator-(const FP32Vec8& b) const { f32x8_t ret; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - ret.val[i] = reg.val[i] - b.reg.val[i]; - } + unroll_loop( + [&ret, &b, this](int i) { ret.val[i] = reg.val[i] - b.reg.val[i]; }); return FP32Vec8(ret); } FP32Vec8 operator/(const FP32Vec8& b) const { f32x8_t ret; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - ret.val[i] = reg.val[i] / b.reg.val[i]; - } + unroll_loop( + [&ret, &b, this](int i) { ret.val[i] = reg.val[i] / b.reg.val[i]; }); return FP32Vec8(ret); } @@ -284,15 +262,11 @@ struct FP32Vec16 : public Vec { f32x16_t reg; explicit FP32Vec16(float v) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = v; - } + unroll_loop([&v, this](int i) { reg.val[i] = v; }); } explicit FP32Vec16() { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = 0.0f; - } + unroll_loop([this](int i) { reg.val[i] = 0.0f; }); } explicit FP32Vec16(const float* ptr) @@ -301,29 +275,27 @@ struct FP32Vec16 : public Vec { explicit FP32Vec16(f32x16_t data) : reg(data) {}; FP32Vec16(const FP32Vec4& data) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { + unroll_loop([&data, this](int i) { reg.val[i] = data.reg.val[i % FP32Vec4::VEC_ELEM_NUM]; - } + }); } FP32Vec16(const FP32Vec8& data) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { + unroll_loop([&data, this](int i) { reg.val[i] = data.reg.val[i % FP32Vec8::VEC_ELEM_NUM]; - } + }); } FP32Vec16(const FP32Vec16& data) : reg(data.reg) {}; explicit FP32Vec16(const FP16Vec16& v) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = fp16_to_float(v.reg.val[i]); - } + unroll_loop( + [&v, this](int i) { reg.val[i] = fp16_to_float(v.reg.val[i]); }); } explicit FP32Vec16(const BF16Vec16& v) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = bf16_to_float(v.reg.val[i]); - } + unroll_loop( + [&v, this](int i) { reg.val[i] = bf16_to_float(v.reg.val[i]); }); } explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}; @@ -331,82 +303,74 @@ struct FP32Vec16 : public Vec { FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}; FP32Vec16 operator*(const FP32Vec16& b) const { - FP32Vec16 result(0.0f); - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result.reg.val[i] = reg.val[i] * b.reg.val[i]; - } - return result; + f32x16_t ret; + unroll_loop( + [&ret, &b, this](int i) { ret.val[i] = reg.val[i] * b.reg.val[i]; }); + return FP32Vec16(ret); } 
FP32Vec16 operator+(const FP32Vec16& b) const { - FP32Vec16 result(0.0f); - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result.reg.val[i] = reg.val[i] + b.reg.val[i]; - } - return result; + f32x16_t ret; + unroll_loop( + [&ret, &b, this](int i) { ret.val[i] = reg.val[i] + b.reg.val[i]; }); + return FP32Vec16(ret); } FP32Vec16 operator-(const FP32Vec16& b) const { - FP32Vec16 result(0.0f); - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result.reg.val[i] = reg.val[i] - b.reg.val[i]; - } - return result; + f32x16_t ret; + unroll_loop( + [&ret, &b, this](int i) { ret.val[i] = reg.val[i] - b.reg.val[i]; }); + return FP32Vec16(ret); } FP32Vec16 operator/(const FP32Vec16& b) const { - FP32Vec16 result(0.0f); - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result.reg.val[i] = reg.val[i] / b.reg.val[i]; - } - return result; + f32x16_t ret; + unroll_loop( + [&ret, &b, this](int i) { ret.val[i] = reg.val[i] / b.reg.val[i]; }); + return FP32Vec16(ret); } FP32Vec16 max(const FP32Vec16& b) const { - FP32Vec16 result(0.0f); - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result.reg.val[i] = __max(reg.val[i], b.reg.val[i]); - } - return result; + f32x16_t ret; + unroll_loop([&ret, &b, this](int i) { + ret.val[i] = std::max(reg.val[i], b.reg.val[i]); + }); + return FP32Vec16(ret); } FP32Vec16 min(const FP32Vec16& b) const { - FP32Vec16 result(0.0f); - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result.reg.val[i] = __min(reg.val[i], b.reg.val[i]); - } - return result; + f32x16_t ret; + unroll_loop([&ret, &b, this](int i) { + ret.val[i] = std::min(reg.val[i], b.reg.val[i]); + }); + return FP32Vec16(ret); } FP32Vec16 abs() const { - FP32Vec16 result(0.0f); - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result.reg.val[i] = __abs(reg.val[i]); - } - return result; + f32x16_t ret; + unroll_loop( + [&ret, this](int i) { ret.val[i] = std::abs(reg.val[i]); }); + return FP32Vec16(ret); } float reduce_sum() const { float result = 0.0f; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result += reg.val[i]; - } + unroll_loop( + [&result, this](int i) { result += reg.val[i]; }); return result; } float reduce_max() const { - float result = reg.val[0]; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result = __max(reg.val[i], result); - } + float result = std::numeric_limits::lowest(); + unroll_loop( + [&result, this](int i) { result = std::max(reg.val[i], result); }); return result; } float reduce_min() const { - float result = reg.val[0]; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result = __min(reg.val[i], result); - } + float result = std::numeric_limits::max(); + unroll_loop( + [&result, this](int i) { result = std::min(reg.val[i], result); }); return result; } @@ -414,13 +378,9 @@ struct FP32Vec16 : public Vec { float reduce_sub_sum(int idx) { static_assert(VEC_ELEM_NUM % group_size == 0); float sum = 0.0; - int start = idx * group_size; - int end = (idx + 1) * group_size; - - for (; (start < VEC_ELEM_NUM) && (start < end); ++start) { - sum += reg.val[start]; - } - + const int start = idx * group_size; + unroll_loop( + [&sum, &start, this](int i) { sum += reg.val[start + i]; }); return sum; } @@ -477,17 +437,13 @@ inline void storeFP32(float v, c10::BFloat16* ptr) { } inline FP16Vec16::FP16Vec16(const FP32Vec16& v) { - int i = 0; - for (i = 0; i < FP16Vec16::VEC_ELEM_NUM; ++i) { - reg.val[i] = float_to_fp16(v.reg.val[i]); - } + unroll_loop( + [&v, this](int i) { reg.val[i] = float_to_fp16(v.reg.val[i]); }); } inline FP16Vec8 ::FP16Vec8(const FP32Vec8& v) { - int i = 0; - for (i = 0; i < FP16Vec8::VEC_ELEM_NUM; ++i) { - reg.val[i] = 
float_to_fp16(v.reg.val[i]); - } + unroll_loop( + [&v, this](int i) { reg.val[i] = float_to_fp16(v.reg.val[i]); }); } inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) { @@ -495,17 +451,13 @@ inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) { } inline BF16Vec8::BF16Vec8(const FP32Vec8& v) { - int i = 0; - for (i = 0; i < BF16Vec8::VEC_ELEM_NUM; ++i) { - reg.val[i] = float_to_bf16(v.reg.val[i]); - } + unroll_loop( + [&v, this](int i) { reg.val[i] = float_to_bf16(v.reg.val[i]); }); } inline BF16Vec16::BF16Vec16(const FP32Vec16& v) { - int i = 0; - for (i = 0; i < BF16Vec16::VEC_ELEM_NUM; ++i) { - reg.val[i] = float_to_bf16(v.reg.val[i]); - } + unroll_loop( + [&v, this](int i) { reg.val[i] = float_to_bf16(v.reg.val[i]); }); } inline void prefetch(const void* addr) { __builtin_prefetch(addr, 0, 3); } From bbc6c2f1e5bc856a9265dfa2b379ed1d242adc33 Mon Sep 17 00:00:00 2001 From: j20120307 Date: Wed, 19 Nov 2025 03:07:22 -0800 Subject: [PATCH 184/578] [CI/Build] Fix broken build on Apple M1 (#28999) Signed-off-by: Kan Zhu --- csrc/cpu/utils.hpp | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/csrc/cpu/utils.hpp b/csrc/cpu/utils.hpp index d8399c56f6af..d3def306b806 100644 --- a/csrc/cpu/utils.hpp +++ b/csrc/cpu/utils.hpp @@ -6,6 +6,10 @@ #include #include +#if defined(__APPLE__) + #include +#endif + #include "cpu_types.hpp" namespace cpu_utils { @@ -21,10 +25,12 @@ struct VecTypeTrait { using vec_t = vec_op::FP32Vec16; }; +#if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT) template <> struct VecTypeTrait { using vec_t = vec_op::BF16Vec16; }; +#endif template <> struct VecTypeTrait { @@ -44,9 +50,21 @@ struct Counter { inline int64_t get_l2_size() { static int64_t size = []() { +#if defined(__APPLE__) + // macOS doesn't have _SC_LEVEL2_CACHE_SIZE. Use sysctlbyname. + int64_t l2_cache_size = 0; + size_t len = sizeof(l2_cache_size); + if (sysctlbyname("hw.l2cachesize", &l2_cache_size, &len, NULL, 0) == 0 && + l2_cache_size > 0) { + return l2_cache_size >> 1; // use 50% of L2 cache + } + // Fallback if sysctlbyname fails + return 128LL * 1024 >> 1; // use 50% of 128KB +#else long l2_cache_size = sysconf(_SC_LEVEL2_CACHE_SIZE); assert(l2_cache_size != -1); return l2_cache_size >> 1; // use 50% of L2 cache +#endif }(); return size; } From 97cfa99d59375de6d5e4c17dc6aea955ae75b493 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 19 Nov 2025 12:32:04 +0100 Subject: [PATCH 185/578] [Docs] Take env var definition out of folded admonition (#29005) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/configuration/env_vars.md | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/docs/configuration/env_vars.md b/docs/configuration/env_vars.md index 2c0a898754fa..f6d548a19d91 100644 --- a/docs/configuration/env_vars.md +++ b/docs/configuration/env_vars.md @@ -7,8 +7,6 @@ vLLM uses the following environment variables to configure the system: All environment variables used by vLLM are prefixed with `VLLM_`. **Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables). -??? 
code - - ```python - --8<-- "vllm/envs.py:env-vars-definition" - ``` +```python +--8<-- "vllm/envs.py:env-vars-definition" +``` From ba558c029ad65ab4f040c8320607ebd87612cf08 Mon Sep 17 00:00:00 2001 From: Tova Movshovitz Date: Wed, 19 Nov 2025 13:37:11 +0200 Subject: [PATCH 186/578] [config] Expose `get_total_num_hidden_layers()` in ModelConfig (#28961) Signed-off-by: tovam Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Co-authored-by: Cyrus Leung --- vllm/config/model.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index 3e8790a26e0e..f61dbb6a695a 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -1369,11 +1369,7 @@ def get_num_experts(self) -> int: # Coerce to 0 if explicitly set to None return num_experts or 0 - def get_layers_start_end_indices( - self, parallel_config: ParallelConfig - ) -> tuple[int, int]: - from vllm.distributed.utils import get_pp_indices - + def get_total_num_hidden_layers(self) -> int: if ( self.hf_text_config.model_type == "deepseek_mtp" or self.hf_config.model_type == "mimo_mtp" @@ -1393,6 +1389,15 @@ def get_layers_start_end_indices( total_num_hidden_layers = getattr( self.hf_text_config, "num_hidden_layers", 0 ) + return total_num_hidden_layers + + def get_layers_start_end_indices( + self, parallel_config: ParallelConfig + ) -> tuple[int, int]: + from vllm.distributed.utils import get_pp_indices + + total_num_hidden_layers = self.get_total_num_hidden_layers() + # the layout order is: DP x PP x TP pp_rank = ( parallel_config.rank // parallel_config.tensor_parallel_size From da2f6800e0d6ac768c6f63b95f7c0755407f4263 Mon Sep 17 00:00:00 2001 From: Chen Bruce Date: Wed, 19 Nov 2025 20:46:24 +0800 Subject: [PATCH 187/578] [Feat][Perf] Enable deepep-low-latency with round-robin expert placement. 
(#28449) Signed-off-by: bruceszchen Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .../layers/fused_moe/all2all_utils.py | 11 ++ .../fused_moe/deepep_ll_prepare_finalize.py | 30 +++- .../layers/fused_moe/fused_moe_method_base.py | 9 +- vllm/model_executor/layers/fused_moe/layer.py | 157 +++++++++++++++--- .../fused_moe/unquantized_fused_moe_method.py | 7 +- .../compressed_tensors_moe.py | 14 +- .../model_executor/layers/quantization/fp8.py | 7 +- .../layers/quantization/modelopt.py | 10 +- 8 files changed, 208 insertions(+), 37 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/all2all_utils.py b/vllm/model_executor/layers/fused_moe/all2all_utils.py index 2dd625054339..86c50f39f007 100644 --- a/vllm/model_executor/layers/fused_moe/all2all_utils.py +++ b/vllm/model_executor/layers/fused_moe/all2all_utils.py @@ -67,6 +67,7 @@ def maybe_roundup_layer_hidden_size( def maybe_make_prepare_finalize( moe: FusedMoEConfig, quant_config: FusedMoEQuantConfig | None, + routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, ) -> FusedMoEPrepareAndFinalize | None: if not moe.moe_parallel_config.use_all2all_kernels: return None @@ -134,6 +135,13 @@ def maybe_make_prepare_finalize( elif moe.use_deepep_ll_kernels: assert quant_config is not None + global_to_physical = physical_to_global = local_expert_global_ids = None + if routing_tables is not None: + ( + global_to_physical, + physical_to_global, + local_expert_global_ids, + ) = routing_tables all_to_all_args = dict( max_num_tokens_per_dp_rank=moe.max_num_tokens, token_hidden_size=moe.hidden_dim, @@ -155,6 +163,9 @@ def maybe_make_prepare_finalize( max_tokens_per_rank=moe.max_num_tokens, num_dispatchers=all2all_manager.world_size, use_fp8_dispatch=use_fp8_dispatch, + global_to_physical=global_to_physical, + physical_to_global=physical_to_global, + local_expert_global_ids=local_expert_global_ids, ) return prepare_finalize diff --git a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py index 06c9df317f7c..e0db248958b4 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py @@ -85,6 +85,9 @@ def __init__( max_tokens_per_rank: int, num_dispatchers: int, use_fp8_dispatch: bool = False, + global_to_physical: torch.Tensor | None = None, + physical_to_global: torch.Tensor | None = None, + local_expert_global_ids: torch.Tensor | None = None, ): super().__init__() @@ -97,6 +100,17 @@ def __init__( self.handles: list[tuple | None] = [None, None] self.num_dispatchers_ = num_dispatchers + topk_indices_dtype = self.topk_indices_dtype() + + def _maybe_cast(tensor: torch.Tensor | None) -> torch.Tensor | None: + if tensor is None or topk_indices_dtype is None: + return tensor + return tensor.to(dtype=topk_indices_dtype) + + self.global_to_physical = _maybe_cast(global_to_physical) + self.physical_to_global = _maybe_cast(physical_to_global) + self.local_expert_global_ids = _maybe_cast(local_expert_global_ids) + # We don't have enough information to determine if we should dispatch # activation scales in a packed ue8m0 format during object construction # time. This setting is handled by post_init_setup. 
@@ -136,6 +150,16 @@ def max_num_tokens_per_rank(self) -> int | None: def topk_indices_dtype(self) -> torch.dtype | None: return torch.int64 + def _map_global_to_physical_ids(self, topk_ids: torch.Tensor) -> torch.Tensor: + if self.global_to_physical is None: + return topk_ids + return self.global_to_physical[topk_ids] + + def _map_local_to_global_ids(self, expert_topk_ids: torch.Tensor) -> torch.Tensor: + if self.local_expert_global_ids is None: + return expert_topk_ids + return self.local_expert_global_ids[expert_topk_ids] + def _do_quant( self, x: torch.Tensor | tuple[torch.Tensor, torch.Tensor], @@ -226,9 +250,10 @@ def prepare_async( a1 = a1 * topk_weights.to(a1.dtype) # Dispatch + dispatch_topk_ids = self._map_global_to_physical_ids(topk_ids) expert_x, expert_num_tokens, handle, _, hook = self.buffer.low_latency_dispatch( a1, - topk_ids, + dispatch_topk_ids, self.max_tokens_per_rank, num_experts, use_fp8=self.use_fp8_dispatch, @@ -313,11 +338,12 @@ def _finalize( # weights have already been applied. combine_topk_weights = torch.ones_like(topk_weights) + combine_topk_ids = self._map_global_to_physical_ids(topk_ids) # TODO (varun) : Enable zero copy mode dbo_maybe_run_recv_hook() _, _, recv_hook = self.buffer.low_latency_combine( fused_expert_output, - topk_ids, + combine_topk_ids, combine_topk_weights, handle, async_finish=False, diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py index 87f8c8d75a9b..073e90a4e680 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py @@ -50,10 +50,15 @@ def uses_weight_scale_2_pattern(self) -> bool: """ return False - def maybe_make_prepare_finalize(self) -> FusedMoEPrepareAndFinalize | None: + def maybe_make_prepare_finalize( + self, + routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, + ) -> FusedMoEPrepareAndFinalize | None: from .all2all_utils import maybe_make_prepare_finalize - return maybe_make_prepare_finalize(self.moe, self.moe_quant_config) + return maybe_make_prepare_finalize( + self.moe, self.moe_quant_config, routing_tables + ) def select_gemm_impl( self, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 023132acfed3..c41995e4a913 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -5,7 +5,7 @@ from contextlib import nullcontext from enum import Enum from functools import partial -from typing import Literal, get_args, overload +from typing import Literal, cast, get_args, overload import torch import torch.nn.functional as F @@ -192,6 +192,42 @@ def determine_expert_map( return (local_num_experts, expert_map, expert_mask) +def determine_expert_placement_strategy( + expert_placement_strategy: ExpertPlacementStrategy, + moe_parallel_config: FusedMoEParallelConfig, + num_expert_group: int | None, + num_redundant_experts: int, + enable_eplb: bool, +) -> ExpertPlacementStrategy: + if expert_placement_strategy == "round_robin": + round_robin_supported = ( + (num_expert_group is not None and num_expert_group > 1) + and num_redundant_experts == 0 + and not enable_eplb + ) + + if not round_robin_supported: + logger.warning( + "Round-robin expert placement is only supported for " + "models with multiple expert groups and no redundant " + "experts. Falling back to linear expert placement." 
+ ) + return "linear" + if ( + moe_parallel_config.use_all2all_kernels + and not moe_parallel_config.use_deepep_ll_kernels + ): + logger.warning( + "Round-robin expert placement currently only supports " + "the DeepEP low-latency backend, but '%s' was configured. " + "Falling back to linear expert placement.", + moe_parallel_config.all2all_backend, + ) + return "linear" + + return expert_placement_strategy + + def get_compressed_expert_map(expert_map: torch.Tensor) -> str: """ Compresses the expert map by removing any -1 entries. @@ -400,6 +436,9 @@ def __init__( self.expert_load_view: torch.Tensor | None = None self.logical_to_physical_map: torch.Tensor | None = None self.logical_replica_count: torch.Tensor | None = None + self.expert_placement_strategy: ExpertPlacementStrategy = ( + vllm_config.parallel_config.expert_placement_strategy + ) # ROCm aiter shared experts fusion self.rocm_aiter_fmoe_enabled = rocm_aiter_ops.is_fused_moe_enabled() @@ -433,38 +472,27 @@ def __init__( "Redundant experts are only supported with EPLB." ) - expert_placement_strategy = ( - vllm_config.parallel_config.expert_placement_strategy + self.expert_placement_strategy = determine_expert_placement_strategy( + expert_placement_strategy=self.expert_placement_strategy, + moe_parallel_config=self.moe_parallel_config, + num_expert_group=num_expert_group, + num_redundant_experts=num_redundant_experts, + enable_eplb=self.enable_eplb, ) - if expert_placement_strategy == "round_robin": - # TODO(Bruce): will support round robin expert placement with - # EPLB enabled in the future. - round_robin_supported = ( - (num_expert_group is not None and num_expert_group > 1) - and num_redundant_experts == 0 - and not self.enable_eplb - ) - - if not round_robin_supported: - logger.warning( - "Round-robin expert placement is only supported for " - "models with multiple expert groups and no redundant " - "experts. Falling back to linear expert placement." - ) - expert_placement_strategy = "linear" self.expert_map: torch.Tensor | None local_num_experts, expert_map, expert_mask = determine_expert_map( ep_size=self.ep_size, ep_rank=self.ep_rank, global_num_experts=self.global_num_experts, - expert_placement_strategy=expert_placement_strategy, + expert_placement_strategy=self.expert_placement_strategy, num_fused_shared_experts=self.num_fused_shared_experts, return_expert_mask=self.rocm_aiter_fmoe_enabled, ) self.local_num_experts = local_num_experts self.register_buffer("expert_map", expert_map) self.register_buffer("expert_mask", expert_mask) + self._maybe_init_expert_routing_tables() logger.info_once( "[EP Rank %s/%s] Expert parallelism is enabled. Expert " "placement strategy: %s. Local/global" @@ -472,7 +500,7 @@ def __init__( " %s.", self.ep_rank, self.ep_size, - expert_placement_strategy, + self.expert_placement_strategy, self.local_num_experts, self.global_num_experts, get_compressed_expert_map(self.expert_map), @@ -621,7 +649,12 @@ def _get_quant_method() -> FusedMoEMethodBase: # should be safe to swap out the quant_method. def maybe_init_modular_kernel(self) -> None: self.ensure_moe_quant_config_init() - prepare_finalize = self.quant_method.maybe_make_prepare_finalize() + # routing_tables only needed for round-robin expert placement with + # DeepEP all2all backend. 
+ routing_tables = self._maybe_init_expert_routing_tables() + prepare_finalize = self.quant_method.maybe_make_prepare_finalize( + routing_tables=routing_tables + ) if prepare_finalize is not None: logger.debug( "%s for %s(%s)", prepare_finalize.__class__.__name__, self, id(self) @@ -703,6 +736,84 @@ def is_internal_router(self) -> bool: # By default, router/gate is called before FusedMoE forward pass return False + def _maybe_init_expert_routing_tables( + self, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None: + # Currently routing_tables only needed for round-robin expert placement + # with DeepEP-ll all2all backend. + if ( + self.expert_placement_strategy != "round_robin" + or not self.use_deepep_ll_kernels + ): + return None + + if hasattr(self, "expert_global_to_physical"): + return cast( + tuple[torch.Tensor, torch.Tensor, torch.Tensor], + ( + self.expert_global_to_physical, + self.expert_physical_to_global, + self.expert_local_to_global, + ), + ) + + if self.expert_map is None: + return None + + routing_tables = self.ensure_round_robin_expert_routing_tables( + global_num_experts=self.global_num_experts, + ep_size=self.ep_size, + ep_rank=self.ep_rank, + local_num_experts=self.local_num_experts, + device=self.expert_map.device, + ) + + global_to_physical, physical_to_global, local_global = routing_tables + self.register_buffer("expert_global_to_physical", global_to_physical) + self.register_buffer("expert_physical_to_global", physical_to_global) + self.register_buffer("expert_local_to_global", local_global) + + return routing_tables + + @staticmethod + def ensure_round_robin_expert_routing_tables( + global_num_experts: int, + ep_size: int, + ep_rank: int, + local_num_experts: int, + device: torch.device | None = None, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + device_kwargs = {"device": device} if device is not None else {} + global_indices = torch.arange( + global_num_experts, dtype=torch.long, **device_kwargs + ) + owner = torch.remainder(global_indices, ep_size) + local_index = torch.div(global_indices, ep_size, rounding_mode="floor") + base = global_num_experts // ep_size + remainder = global_num_experts % ep_size + physical_offset = owner * base + if remainder > 0: + remainder_tensor = torch.tensor( + remainder, dtype=torch.long, **device_kwargs + ) + physical_offset = physical_offset + torch.minimum(owner, remainder_tensor) + + global_to_physical = physical_offset + local_index + physical_to_global = torch.empty_like(global_to_physical) + physical_to_global[global_to_physical] = global_indices + + local_global = torch.arange( + ep_rank, + global_num_experts, + ep_size, + dtype=torch.long, + **device_kwargs, + ) + if local_global.numel() != local_num_experts: + local_global = local_global[:local_num_experts] + + return (global_to_physical, physical_to_global, local_global) + def update_expert_map(self): # ep_size and ep_rank should already be updated assert self.expert_map is not None @@ -711,12 +822,14 @@ def update_expert_map(self): ep_size=self.ep_size, ep_rank=self.ep_rank, global_num_experts=self.global_num_experts, + expert_placement_strategy=self.expert_placement_strategy, num_fused_shared_experts=self.num_fused_shared_experts, return_expert_mask=self.rocm_aiter_fmoe_enabled, ) self.local_num_experts = local_num_experts self.register_buffer("expert_map", expert_map) self.register_buffer("expert_mask", expert_mask) + self._maybe_init_expert_routing_tables() if self.aiter_fmoe_shared_expert_enabled: self._init_aiter_shared_experts_topK_buffer( 
vllm_config=get_current_vllm_config(), diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py index 2e0376553b91..63b0e6f573d6 100644 --- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py @@ -108,11 +108,14 @@ def supports_eplb(self) -> bool: def allow_inplace(self) -> bool: return True - def maybe_make_prepare_finalize(self) -> FusedMoEPrepareAndFinalize | None: + def maybe_make_prepare_finalize( + self, + routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, + ) -> FusedMoEPrepareAndFinalize | None: if self.rocm_aiter_moe_enabled: return None else: - return super().maybe_make_prepare_finalize() + return super().maybe_make_prepare_finalize(routing_tables) def select_gemm_impl( self, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 06ee96d55419..22b3c477f420 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -380,11 +380,14 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: (layer.w2_input_global_scale), requires_grad=False ) - def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None: + def maybe_make_prepare_finalize( + self, + routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, + ) -> mk.FusedMoEPrepareAndFinalize | None: if self.use_marlin: return None elif not self.allow_flashinfer: - return super().maybe_make_prepare_finalize() + return super().maybe_make_prepare_finalize(routing_tables) prepare_finalize = build_flashinfer_fp4_cutlass_moe_prepare_finalize(self.moe) logger.debug_once("%s", prepare_finalize.__class__.__name__) @@ -890,11 +893,14 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.w2_weight_scale ) - def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None: + def maybe_make_prepare_finalize( + self, + routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, + ) -> mk.FusedMoEPrepareAndFinalize | None: if self.use_marlin or self.rocm_aiter_moe_enabled: return None else: - return super().maybe_make_prepare_finalize() + return super().maybe_make_prepare_finalize(routing_tables) def select_gemm_impl( self, diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 0479bec33840..92fbdd709348 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -1018,7 +1018,10 @@ def process_weights_after_loading(self, layer: Module) -> None: del layer.w13_input_scale del layer.w2_input_scale - def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None: + def maybe_make_prepare_finalize( + self, + routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, + ) -> mk.FusedMoEPrepareAndFinalize | None: if ( self.rocm_aiter_moe_enabled or self.use_marlin @@ -1039,7 +1042,7 @@ def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None: logger.debug_once("%s", prepare_finalize.__class__.__name__) return prepare_finalize else: - return super().maybe_make_prepare_finalize() + return 
super().maybe_make_prepare_finalize(routing_tables) def select_gemm_impl( self, diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 476521813f46..38ab7cd4f115 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -373,6 +373,7 @@ def __init__( def maybe_make_prepare_finalize( self, + routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, ) -> mk.FusedMoEPrepareAndFinalize | None: # TRT LLM not supported with all2all yet. if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM: @@ -384,7 +385,7 @@ def maybe_make_prepare_finalize( logger.debug_once("%s", prepare_finalize.__class__.__name__) return prepare_finalize else: - return super().maybe_make_prepare_finalize() + return super().maybe_make_prepare_finalize(routing_tables) def select_gemm_impl( self, @@ -1179,7 +1180,10 @@ def __init__( " for ModelOptNvFp4FusedMoE." ) - def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None: + def maybe_make_prepare_finalize( + self, + routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, + ) -> mk.FusedMoEPrepareAndFinalize | None: if self.use_marlin or ( self.allow_flashinfer and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM @@ -1196,7 +1200,7 @@ def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None: logger.debug_once("%s", prepare_finalize.__class__.__name__) return prepare_finalize else: - return super().maybe_make_prepare_finalize() + return super().maybe_make_prepare_finalize(routing_tables) def select_gemm_impl( self, From 09540cd918a5f7d776d7f7e0abec78fbc03938ad Mon Sep 17 00:00:00 2001 From: Didier Durand <2927957+didier-durand@users.noreply.github.com> Date: Wed, 19 Nov 2025 13:56:21 +0100 Subject: [PATCH 188/578] [Doc]: fix typos in various files (#29010) Signed-off-by: Didier Durand --- docs/deployment/frameworks/skypilot.md | 2 +- docs/design/prefix_caching.md | 2 +- docs/features/nixl_connector_usage.md | 2 +- docs/getting_started/quickstart.md | 2 +- tests/v1/ec_connector/integration/README.md | 2 +- vllm/multimodal/evs.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/deployment/frameworks/skypilot.md b/docs/deployment/frameworks/skypilot.md index f4a984a6433e..e9b0d5f0671c 100644 --- a/docs/deployment/frameworks/skypilot.md +++ b/docs/deployment/frameworks/skypilot.md @@ -4,7 +4,7 @@ vLLM

-vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with [SkyPilot](https://github.com/skypilot-org/skypilot), an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc, can be found in [SkyPilot AI gallery](https://skypilot.readthedocs.io/en/latest/gallery/index.html). +vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with [SkyPilot](https://github.com/skypilot-org/skypilot), an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc., can be found in [SkyPilot AI gallery](https://skypilot.readthedocs.io/en/latest/gallery/index.html). ## Prerequisites diff --git a/docs/design/prefix_caching.md b/docs/design/prefix_caching.md index bd4070f381d8..48536a877bd3 100644 --- a/docs/design/prefix_caching.md +++ b/docs/design/prefix_caching.md @@ -1,6 +1,6 @@ # Automatic Prefix Caching -Prefix caching kv-cache blocks is a popular optimization in LLM inference to avoid redundant prompt computations. The core idea is simple – we cache the kv-cache blocks of processed requests, and reuse these blocks when a new request comes in with the same prefix as previous requests. Since prefix caching is almost a free lunch and won’t change model outputs, it has been widely used by many public endpoints (e.g., OpenAI, Anthropic, etc) and most open source LLM inference frameworks (e.g., SGLang). +Prefix caching kv-cache blocks is a popular optimization in LLM inference to avoid redundant prompt computations. The core idea is simple – we cache the kv-cache blocks of processed requests, and reuse these blocks when a new request comes in with the same prefix as previous requests. Since prefix caching is almost a free lunch and won’t change model outputs, it has been widely used by many public endpoints (e.g., OpenAI, Anthropic, etc.) and most open source LLM inference frameworks (e.g., SGLang). While there are many ways to implement prefix caching, vLLM chooses a hash-based approach. Specifically, we hash each kv-cache block by the tokens in the block and the tokens in the prefix before the block: diff --git a/docs/features/nixl_connector_usage.md b/docs/features/nixl_connector_usage.md index 1ce038f4d652..f0e25e31aa0b 100644 --- a/docs/features/nixl_connector_usage.md +++ b/docs/features/nixl_connector_usage.md @@ -158,7 +158,7 @@ python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \ ## Experimental Feature -### Heterogenuous KV Layout support +### Heterogeneous KV Layout support Support use case: Prefill with 'HND' and decode with 'NHD' with experimental configuration diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md index cfc8b4d9838a..9e86f785b10c 100644 --- a/docs/getting_started/quickstart.md +++ b/docs/getting_started/quickstart.md @@ -286,7 +286,7 @@ If desired, you can also manually set the backend of your choice by configuring - On NVIDIA CUDA: `FLASH_ATTN`, `FLASHINFER` or `XFORMERS`. - On AMD ROCm: `TRITON_ATTN`, `ROCM_ATTN`, `ROCM_AITER_FA` or `ROCM_AITER_UNIFIED_ATTN`. 
-For AMD ROCm, you can futher control the specific Attention implementation using the following variables: +For AMD ROCm, you can further control the specific Attention implementation using the following variables: - Triton Unified Attention: `VLLM_ROCM_USE_AITER=0 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=0 VLLM_ROCM_USE_AITER_MHA=0` - AITER Unified Attention: `VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=0 VLLM_ROCM_USE_AITER_MHA=0` diff --git a/tests/v1/ec_connector/integration/README.md b/tests/v1/ec_connector/integration/README.md index 30426e055ade..2dbcb307fda3 100644 --- a/tests/v1/ec_connector/integration/README.md +++ b/tests/v1/ec_connector/integration/README.md @@ -113,7 +113,7 @@ Quick sanity check: - Outputs differ between baseline and disagg - Server startup fails -- Encoder cache not found (should fallback to local execution) +- Encoder cache not found (should fall back to local execution) - Proxy routing errors ## Notes diff --git a/vllm/multimodal/evs.py b/vllm/multimodal/evs.py index 4a288d2d238c..8a36ea415da4 100644 --- a/vllm/multimodal/evs.py +++ b/vllm/multimodal/evs.py @@ -185,7 +185,7 @@ def recompute_mrope_positions( Args: input_ids: (N,) All input tokens of the prompt (entire sequence). - multimodal_positions: List of mrope positsions for each media. + multimodal_positions: List of mrope positions for each media. mrope_positions: Existing mrope positions (4, N) for entire sequence. num_computed_tokens: A number of computed tokens so far. vision_start_token_id: Token indicating start of vision media. From 4f5299f7174ffb10bdc640b47d3494083fc39c48 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 19 Nov 2025 14:50:30 +0100 Subject: [PATCH 189/578] Relax Transformers modeling backend MoE experts check (#28952) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/models/supported_models.md | 4 +++- vllm/model_executor/models/transformers/moe.py | 9 ++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index bd14bbb9ab66..80fe143269a7 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -79,7 +79,9 @@ To make your model compatible with the Transformers modeling backend, it needs: 1. Add `is_causal = False` to `MyAttention`. - If your model is mixture-of-experts (MoE): 1. Your sparse MoE block must have an attribute called `experts`. - 2. The class of `experts` (`MyExperts`) must inherit from `nn.ModuleList`. + 2. The class of `experts` (`MyExperts`) must either: + - Inherit from `nn.ModuleList` (naive). + - Or contain all 3D `nn.Parameters` (packed). 3. `MyExperts.forward` must accept `hidden_states`, `top_k_index`, `top_k_weights`. 2. `MyAttention` must use `ALL_ATTENTION_FUNCTIONS` to call attention. 3. `MyModel` must contain `_supports_attention_backend = True`. 
diff --git a/vllm/model_executor/models/transformers/moe.py b/vllm/model_executor/models/transformers/moe.py index 4973014c3d4e..31db9d682bd4 100644 --- a/vllm/model_executor/models/transformers/moe.py +++ b/vllm/model_executor/models/transformers/moe.py @@ -256,7 +256,14 @@ def forward(self, *args, **kwargs): def _recursive_replace(module: nn.Module, prefix: str): for child_name, child_module in module.named_children(): qual_name = maybe_prefix(prefix, child_name) - if child_name == "experts" and isinstance(child_module, nn.ModuleList): + # Naive implementations will have experts as ModuleList + is_modulelist = isinstance(child_module, nn.ModuleList) + # Packed implementations will have experts as 3D tensors of shapes like: + # gate_up_proj = (num_experts, 2 * intermediate_size, hidden_size) + # down_proj = (num_experts, intermediate_size, hidden_size) + params = list(child_module.parameters()) + is_3d = len(params) > 0 and all(p.ndim == 3 for p in params) + if child_name == "experts" and (is_modulelist or is_3d): # Alias for readability mlp = module experts = child_module From 2c8b9182b5ced00d83bed15ef8bc0ac6e079b6ee Mon Sep 17 00:00:00 2001 From: Yanan Cao Date: Wed, 19 Nov 2025 06:13:50 -0800 Subject: [PATCH 190/578] [CI] Reorganize compile tests so new tests are automatically included in CI (#28625) Signed-off-by: Yanan Cao --- .buildkite/test-amd.yaml | 57 ++++++++--------- .buildkite/test-pipeline.yaml | 62 +++++++++---------- tests/compile/README.md | 5 ++ .../{piecewise => distributed}/__init__.py | 0 .../{ => distributed}/test_async_tp.py | 6 +- .../test_fusion_all_reduce.py | 4 +- .../{ => distributed}/test_fusions_e2e.py | 2 +- .../test_sequence_parallelism.py | 4 +- tests/compile/fullgraph/__init__.py | 0 .../{ => fullgraph}/test_basic_correctness.py | 2 +- .../test_full_cudagraph.py | 0 .../{ => fullgraph}/test_full_graph.py | 2 +- .../test_multimodal_compile.py | 0 .../test_multiple_graphs.py | 0 .../{piecewise => fullgraph}/test_simple.py | 0 .../test_toy_llama.py | 0 vllm/env_override.py | 2 +- 17 files changed, 74 insertions(+), 72 deletions(-) create mode 100644 tests/compile/README.md rename tests/compile/{piecewise => distributed}/__init__.py (100%) rename tests/compile/{ => distributed}/test_async_tp.py (99%) rename tests/compile/{ => distributed}/test_fusion_all_reduce.py (99%) rename tests/compile/{ => distributed}/test_fusions_e2e.py (99%) rename tests/compile/{ => distributed}/test_sequence_parallelism.py (99%) create mode 100644 tests/compile/fullgraph/__init__.py rename tests/compile/{ => fullgraph}/test_basic_correctness.py (99%) rename tests/compile/{piecewise => fullgraph}/test_full_cudagraph.py (100%) rename tests/compile/{ => fullgraph}/test_full_graph.py (99%) rename tests/compile/{ => fullgraph}/test_multimodal_compile.py (100%) rename tests/compile/{piecewise => fullgraph}/test_multiple_graphs.py (100%) rename tests/compile/{piecewise => fullgraph}/test_simple.py (100%) rename tests/compile/{piecewise => fullgraph}/test_toy_llama.py (100%) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 2471b509a9ff..0049f3540340 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -187,7 +187,7 @@ steps: - tests/distributed/test_utils - tests/distributed/test_pynccl - tests/distributed/test_events - - tests/compile/test_basic_correctness + - tests/compile/fullgraph/test_basic_correctness.py - examples/offline_inference/rlhf.py - examples/offline_inference/rlhf_colocate.py - tests/examples/offline_inference/data_parallel.py @@ 
-215,7 +215,7 @@ steps: - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - pytest -v -s distributed/test_utils.py - - pytest -v -s compile/test_basic_correctness.py + - pytest -v -s compile/fullgraph/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py - pytest -v -s distributed/test_events.py - pytest -v -s distributed/test_symm_mem_allreduce.py @@ -493,17 +493,12 @@ steps: - vllm/ - tests/compile commands: - - pytest -v -s compile/test_pass_manager.py - - pytest -v -s compile/test_fusion.py - - pytest -v -s compile/test_fusion_attn.py - - pytest -v -s compile/test_functionalization.py - - pytest -v -s compile/test_silu_mul_quant_fusion.py - # - pytest -v -s compile/test_sequence_parallelism.py - # - pytest -v -s compile/test_async_tp.py - - pytest -v -s compile/test_fusion_all_reduce.py - - pytest -v -s compile/test_decorator.py - - pytest -v -s compile/test_noop_elimination.py - - pytest -v -s compile/test_aot_compile.py + # Run unit tests defined directly under compile/, + # not including subdirectories, which are usually heavier + # tests covered elsewhere. + # Use `find` to launch multiple instances of pytest so that + # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 + - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" - label: PyTorch Fullgraph Smoke Test # 15min timeout_in_minutes: 30 @@ -515,9 +510,11 @@ steps: - vllm/ - tests/compile commands: - - pytest -v -s compile/test_basic_correctness.py - - pytest -v -s compile/test_multimodal_compile.py - - pytest -v -s compile/piecewise/ + # Run smoke tests under fullgraph directory, except test_full_graph.py + # as it is a heavy test that is covered in other steps. 
+ # Use `find` to launch multiple instances of pytest so that + # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 + - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;" - label: PyTorch Fullgraph Test # 27min timeout_in_minutes: 40 @@ -529,10 +526,10 @@ steps: - vllm/ - tests/compile commands: - - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile' + - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' # Limit to no custom ops to reduce running time # Wrap with quotes to escape yaml and avoid starting -k string with a - - - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'" + - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and -quant_fp8'" - label: Cudagraph test timeout_in_minutes: 20 @@ -1066,10 +1063,10 @@ steps: - pytest -v -s tests/compile/test_fusion_attn.py - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py # this runner has 2 GPUs available even though num_gpus=2 is not set - - pytest -v -s tests/compile/test_fusion_all_reduce.py + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time # Wrap with quotes to escape yaml - - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'" + - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'" - label: Blackwell Fusion E2E Tests # 30 min timeout_in_minutes: 40 @@ -1086,14 +1083,14 @@ steps: - vllm/model_executor/layers/layernorm.py - vllm/model_executor/layers/activation.py - vllm/model_executor/layers/quantization/input_quant_fp8.py - - tests/compile/test_fusions_e2e.py - - tests/compile/test_full_graph.py + - tests/compile/distributed/test_fusions_e2e.py + - tests/compile/fullgraph/test_full_graph.py commands: - nvidia-smi # Run all e2e fusion tests - - pytest -v -s tests/compile/test_fusions_e2e.py + - pytest -v -s tests/compile/distributed/test_fusions_e2e.py # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) - - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile + - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile - label: ROCm GPT-OSS Eval timeout_in_minutes: 60 @@ -1198,7 +1195,7 @@ steps: - vllm/worker/worker_base.py - vllm/v1/engine/ - vllm/v1/worker/ - - tests/compile/test_basic_correctness.py + - tests/compile/fullgraph/test_basic_correctness.py - tests/compile/test_wrapper.py - tests/distributed/ - tests/entrypoints/llm/test_collective_rpc.py @@ -1211,7 +1208,7 @@ steps: - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - pytest -v -s entrypoints/llm/test_collective_rpc.py - - pytest -v -s ./compile/test_basic_correctness.py + - pytest -v -s ./compile/fullgraph/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' @@ -1417,10 +1414,10 @@ steps: working_dir: "/vllm-workspace/" num_gpus: 2 commands: - - pytest 
-v -s tests/compile/test_async_tp.py - - pytest -v -s tests/compile/test_sequence_parallelism.py - - pytest -v -s tests/compile/test_fusion_all_reduce.py - - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm + - pytest -v -s tests/compile/distributed/test_async_tp.py + - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py + - pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm - pytest -v -s tests/distributed/test_context_parallel.py - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 - pytest -v -s tests/v1/distributed/test_dbo.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 4ac76aba67b9..e62cd60efaec 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -167,7 +167,7 @@ steps: - tests/distributed/test_utils - tests/distributed/test_pynccl - tests/distributed/test_events - - tests/compile/test_basic_correctness + - tests/compile/fullgraph/test_basic_correctness.py - examples/offline_inference/rlhf.py - examples/offline_inference/rlhf_colocate.py - tests/examples/offline_inference/data_parallel.py @@ -197,7 +197,7 @@ steps: - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - pytest -v -s distributed/test_utils.py - - pytest -v -s compile/test_basic_correctness.py + - pytest -v -s compile/fullgraph/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py - pytest -v -s distributed/test_events.py - pytest -v -s distributed/test_symm_mem_allreduce.py @@ -445,18 +445,12 @@ steps: - vllm/ - tests/compile commands: - - pytest -v -s compile/test_graph_partition.py - - pytest -v -s compile/test_config.py - - pytest -v -s compile/test_pass_manager.py - - pytest -v -s compile/test_fusion.py - - pytest -v -s compile/test_fusion_attn.py - - pytest -v -s compile/test_functionalization.py - - pytest -v -s compile/test_silu_mul_quant_fusion.py - - pytest -v -s compile/test_fusion_all_reduce.py - - pytest -v -s compile/test_decorator.py - - pytest -v -s compile/test_noop_elimination.py - - pytest -v -s compile/test_aot_compile.py - - pytest -v -s compile/test_qk_norm_rope_fusion.py + # Run unit tests defined directly under compile/, + # not including subdirectories, which are usually heavier + # tests covered elsewhere. + # Use `find` to launch multiple instances of pytest so that + # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 + - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" - label: PyTorch Fullgraph Smoke Test # 15min timeout_in_minutes: 30 @@ -466,9 +460,11 @@ steps: - vllm/ - tests/compile commands: - - pytest -v -s compile/test_basic_correctness.py - - pytest -v -s compile/test_multimodal_compile.py - - pytest -v -s compile/piecewise/ + # Run smoke tests under fullgraph directory, except test_full_graph.py + # as it is a heavy test that is covered in other steps. 
+ # Use `find` to launch multiple instances of pytest so that + # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 + - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;" - label: PyTorch Fullgraph Test # 27min timeout_in_minutes: 40 @@ -479,10 +475,10 @@ steps: - tests/compile commands: # fp8 kv scales not supported on sm89, tested on Blackwell instead - - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile' + - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' # Limit to no custom ops to reduce running time # Wrap with quotes to escape yaml and avoid starting -k string with a - - - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" + - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" - label: Cudagraph test timeout_in_minutes: 20 @@ -939,17 +935,22 @@ steps: - vllm/model_executor/layers/layernorm.py - vllm/model_executor/layers/activation.py - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/test_fusion_attn.py + - tests/compile/test_silu_mul_quant_fusion.py + - tests/compile/distributed/test_fusion_all_reduce.py + - tests/compile/distributed/test_fusions_e2e.py + - tests/compile/fullgraph/test_full_graph.py commands: - nvidia-smi - pytest -v -s tests/compile/test_fusion_attn.py - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py # this runner has 2 GPUs available even though num_gpus=2 is not set - - pytest -v -s tests/compile/test_fusion_all_reduce.py + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time # Wrap with quotes to escape yaml - - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" + - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) - - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile + - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile - label: Blackwell Fusion E2E Tests # 30 min timeout_in_minutes: 40 @@ -966,12 +967,11 @@ steps: - vllm/model_executor/layers/layernorm.py - vllm/model_executor/layers/activation.py - vllm/model_executor/layers/quantization/input_quant_fp8.py - - tests/compile/test_fusions_e2e.py - - tests/compile/test_full_graph.py + - tests/compile/distributed/test_fusions_e2e.py commands: - nvidia-smi # Run all e2e fusion tests - - pytest -v -s tests/compile/test_fusions_e2e.py + - pytest -v -s tests/compile/distributed/test_fusions_e2e.py - label: Blackwell GPT-OSS Eval timeout_in_minutes: 60 @@ -1069,7 +1069,7 @@ steps: - vllm/worker/worker_base.py - vllm/v1/engine/ - vllm/v1/worker/ - - tests/compile/test_basic_correctness.py + - tests/compile/fullgraph/test_basic_correctness.py - tests/compile/test_wrapper.py - tests/distributed/ - tests/entrypoints/llm/test_collective_rpc.py @@ -1084,7 +1084,7 @@ steps: - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - pytest -v -s entrypoints/llm/test_collective_rpc.py - - pytest -v -s ./compile/test_basic_correctness.py + - pytest -v 
-s ./compile/fullgraph/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' @@ -1264,10 +1264,10 @@ steps: working_dir: "/vllm-workspace/" num_gpus: 2 commands: - - pytest -v -s tests/compile/test_async_tp.py - - pytest -v -s tests/compile/test_sequence_parallelism.py - - pytest -v -s tests/compile/test_fusion_all_reduce.py - - "pytest -v -s tests/compile/test_fusions_e2e.py -k 'not Llama-4'" + - pytest -v -s tests/compile/distributed/test_async_tp.py + - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py + - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - pytest -v -s tests/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 diff --git a/tests/compile/README.md b/tests/compile/README.md new file mode 100644 index 000000000000..300a95686000 --- /dev/null +++ b/tests/compile/README.md @@ -0,0 +1,5 @@ +# compile test folder structure + +- `compile/test_*.py` : various unit tests meant for testing particular code paths/features. Future tests will most likely be added here. New test files added here will be included in CI automatically +- `compile/fullgraph/` : full model tests, including all tests previously in compile/piecewise. These tests do not target particular features. New test files added here will be included in CI automatically +- `compile/distributed/` : tests that require multiple GPUs. New test files added here will **NOT** be included in CI automatically as these tests generally need to be manually configured to run in runners with a particular number/type of GPUs.
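To make the CI comments above concrete: each top-level test file under `compile/` is launched in its own pytest process via `find ... -exec pytest`, which is the workaround for vllm-project/vllm issue #28965 and is also what makes new test files picked up without editing the pipeline YAML. Below is a rough, hedged Python equivalent; the path and invocation details are illustrative, and the real pipelines invoke `find` directly.

```python
# Rough Python equivalent of the `find ... -exec pytest` commands in the CI
# steps above (a sketch only; the pipelines invoke `find` directly).
# Running each file in its own pytest process is the workaround for
# https://github.com/vllm-project/vllm/issues/28965.
import pathlib
import subprocess
import sys

compile_dir = pathlib.Path("tests/compile")  # assumed working-directory layout
for test_file in sorted(compile_dir.glob("test_*.py")):  # non-recursive, like -maxdepth 1
    subprocess.run(
        [sys.executable, "-m", "pytest", "-s", "-v", str(test_file)],
        check=True,
    )
```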
diff --git a/tests/compile/piecewise/__init__.py b/tests/compile/distributed/__init__.py similarity index 100% rename from tests/compile/piecewise/__init__.py rename to tests/compile/distributed/__init__.py diff --git a/tests/compile/test_async_tp.py b/tests/compile/distributed/test_async_tp.py similarity index 99% rename from tests/compile/test_async_tp.py rename to tests/compile/distributed/test_async_tp.py index 71ee22878143..86d409f1eadb 100644 --- a/tests/compile/test_async_tp.py +++ b/tests/compile/distributed/test_async_tp.py @@ -27,13 +27,13 @@ from vllm.platforms import current_platform from vllm.utils.system_utils import update_environment_variables -from ..models.registry import HF_EXAMPLE_MODELS -from ..utils import ( +from ...models.registry import HF_EXAMPLE_MODELS +from ...utils import ( compare_two_settings, create_new_process_for_each_test, multi_gpu_test, ) -from .backend import TestBackend +from ..backend import TestBackend FP8_DTYPE = current_platform.fp8_dtype() diff --git a/tests/compile/test_fusion_all_reduce.py b/tests/compile/distributed/test_fusion_all_reduce.py similarity index 99% rename from tests/compile/test_fusion_all_reduce.py rename to tests/compile/distributed/test_fusion_all_reduce.py index 6d0a0ed7d89d..d401d5703275 100644 --- a/tests/compile/test_fusion_all_reduce.py +++ b/tests/compile/distributed/test_fusion_all_reduce.py @@ -33,8 +33,8 @@ from vllm.platforms import current_platform from vllm.utils.system_utils import update_environment_variables -from ..utils import has_module_attribute, multi_gpu_test -from .backend import TestBackend +from ...utils import has_module_attribute, multi_gpu_test +from ..backend import TestBackend class TestAllReduceRMSNormModel(torch.nn.Module): diff --git a/tests/compile/test_fusions_e2e.py b/tests/compile/distributed/test_fusions_e2e.py similarity index 99% rename from tests/compile/test_fusions_e2e.py rename to tests/compile/distributed/test_fusions_e2e.py index f22d60ef000b..2e1b595a4389 100644 --- a/tests/compile/test_fusions_e2e.py +++ b/tests/compile/distributed/test_fusions_e2e.py @@ -18,7 +18,7 @@ from vllm.utils.flashinfer import has_flashinfer from vllm.utils.torch_utils import is_torch_equal_or_newer -from ..utils import flat_product, multi_gpu_test +from ...utils import flat_product, multi_gpu_test is_blackwell = lambda: current_platform.is_device_capability(100) """Are we running on Blackwell, a lot of tests depend on it""" diff --git a/tests/compile/test_sequence_parallelism.py b/tests/compile/distributed/test_sequence_parallelism.py similarity index 99% rename from tests/compile/test_sequence_parallelism.py rename to tests/compile/distributed/test_sequence_parallelism.py index 9cd7f64b04af..30084dfd5a95 100644 --- a/tests/compile/test_sequence_parallelism.py +++ b/tests/compile/distributed/test_sequence_parallelism.py @@ -32,8 +32,8 @@ from vllm.platforms import current_platform from vllm.utils.system_utils import update_environment_variables -from ..utils import multi_gpu_test -from .backend import TestBackend +from ...utils import multi_gpu_test +from ..backend import TestBackend FP8_DTYPE = current_platform.fp8_dtype() prompts = [ diff --git a/tests/compile/fullgraph/__init__.py b/tests/compile/fullgraph/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/fullgraph/test_basic_correctness.py similarity index 99% rename from tests/compile/test_basic_correctness.py rename to 
tests/compile/fullgraph/test_basic_correctness.py index 3f6898607f6b..965938c4433d 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/fullgraph/test_basic_correctness.py @@ -7,7 +7,7 @@ from vllm.config import CompilationMode from vllm.utils.torch_utils import cuda_device_count_stateless -from ..utils import compare_all_settings +from ...utils import compare_all_settings @dataclasses.dataclass diff --git a/tests/compile/piecewise/test_full_cudagraph.py b/tests/compile/fullgraph/test_full_cudagraph.py similarity index 100% rename from tests/compile/piecewise/test_full_cudagraph.py rename to tests/compile/fullgraph/test_full_cudagraph.py diff --git a/tests/compile/test_full_graph.py b/tests/compile/fullgraph/test_full_graph.py similarity index 99% rename from tests/compile/test_full_graph.py rename to tests/compile/fullgraph/test_full_graph.py index b4e5e56ac9fe..2c11ecef7f02 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/fullgraph/test_full_graph.py @@ -15,7 +15,7 @@ from vllm.platforms import current_platform from vllm.utils.torch_utils import is_torch_equal_or_newer -from ..utils import create_new_process_for_each_test +from ...utils import create_new_process_for_each_test def models_list(*, all: bool = True, keywords: list[str] | None = None): diff --git a/tests/compile/test_multimodal_compile.py b/tests/compile/fullgraph/test_multimodal_compile.py similarity index 100% rename from tests/compile/test_multimodal_compile.py rename to tests/compile/fullgraph/test_multimodal_compile.py diff --git a/tests/compile/piecewise/test_multiple_graphs.py b/tests/compile/fullgraph/test_multiple_graphs.py similarity index 100% rename from tests/compile/piecewise/test_multiple_graphs.py rename to tests/compile/fullgraph/test_multiple_graphs.py diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/fullgraph/test_simple.py similarity index 100% rename from tests/compile/piecewise/test_simple.py rename to tests/compile/fullgraph/test_simple.py diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/fullgraph/test_toy_llama.py similarity index 100% rename from tests/compile/piecewise/test_toy_llama.py rename to tests/compile/fullgraph/test_toy_llama.py diff --git a/vllm/env_override.py b/vllm/env_override.py index 14dae2850c35..9ae1af3af46c 100644 --- a/vllm/env_override.py +++ b/vllm/env_override.py @@ -95,7 +95,7 @@ def get_output_names(graph_outputs) -> list[str]: # =================================================== # This change monkeypatches get_graph_partition_signature in pytorch 2.9.0 to # fix inductor partition + attention-nvfp4 quant fusion, tested in -# `tests/compile/test_fusions_e2e.py::test_attn_quant`. +# `tests/compile/distributed/test_fusions_e2e.py::test_attn_quant`. # For more context, see https://github.com/pytorch/pytorch/pull/165815. 
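The following patch ([PATCH 191/578]) moves the config hashing used for torch.compile cache keys from hand-maintained factor lists to an opt-out scheme built on `get_hash_factors`, `hash_factors`, and `normalize_value` in `vllm/config/utils.py`. As a self-contained sketch of that pattern, using toy names only (nothing below is vLLM API; it just illustrates the idea):

```python
# Standalone sketch of the opt-out hashing pattern introduced in the next
# patch. All names here are toy stand-ins; the real helpers
# (get_hash_factors, hash_factors, normalize_value) live in vllm/config/utils.py.
import hashlib
import json
from dataclasses import dataclass, fields


def normalize(value):
    # Minimal normalizer: lists become tuples; everything else is assumed
    # to already be JSON-serializable.
    return tuple(value) if isinstance(value, list) else value


@dataclass
class ToyConfig:
    mode: int = 0
    custom_ops: list[str] | None = None
    cache_dir: str = ""  # runtime-only detail, excluded from the hash


def compute_hash(cfg: ToyConfig, ignored: set[str]) -> str:
    # Opt-out: every declared field is hashed unless explicitly ignored.
    factors = {
        f.name: normalize(getattr(cfg, f.name))
        for f in fields(cfg)
        if f.name not in ignored
    }
    return hashlib.sha256(json.dumps(factors, sort_keys=True).encode()).hexdigest()


print(compute_hash(ToyConfig(custom_ops=["+rms_norm"]), ignored={"cache_dir"}))
```

Every declared field participates in the cache key unless explicitly listed as ignored, which is the opt-out behavior the patch title refers to.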
From 1ffe934c8ae978e5ed82559a1eaeca05e37f9b35 Mon Sep 17 00:00:00 2001 From: vnadathur Date: Wed, 19 Nov 2025 06:13:54 -0800 Subject: [PATCH 191/578] [torch.compile] caching of config fields should be opt-out by default (#26468) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: vnadathur Signed-off-by: WorldExplored Signed-off-by: Srreyansh Sethi Signed-off-by: Srreyansh Sethi <107075589+WorldExplored@users.noreply.github.com> Co-authored-by: WorldExplored Co-authored-by: Srreyansh Sethi <107075589+worldexplored@users.noreply.github.com> Co-authored-by: vnadathur <236933696+vnadathur@users.noreply.github.com> Co-authored-by: Luka Govedič --- tests/config/test_config_utils.py | 166 +++++++++++++++++++++++++++++ vllm/compilation/backends.py | 105 +++++++++++++++---- vllm/compilation/pass_manager.py | 2 +- vllm/config/cache.py | 31 ++++-- vllm/config/compilation.py | 40 +++---- vllm/config/model.py | 92 ++++++++-------- vllm/config/parallel.py | 49 ++++++--- vllm/config/utils.py | 119 ++++++++++++++++++++- vllm/envs.py | 169 +++++++++++++++--------------- vllm/logging_utils/__init__.py | 2 + vllm/logging_utils/lazy.py | 20 ++++ 11 files changed, 602 insertions(+), 193 deletions(-) create mode 100644 tests/config/test_config_utils.py create mode 100644 vllm/logging_utils/lazy.py diff --git a/tests/config/test_config_utils.py b/tests/config/test_config_utils.py new file mode 100644 index 000000000000..1277c7e64eb2 --- /dev/null +++ b/tests/config/test_config_utils.py @@ -0,0 +1,166 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from dataclasses import dataclass +from enum import Enum + +import pytest + +from vllm.config.utils import get_hash_factors, hash_factors, normalize_value + +# Helpers + + +def endswith_fqname(obj, suffix: str) -> bool: + # normalize_value(type) returns fully-qualified name + # Compare suffix to avoid brittle import paths. + out = normalize_value(obj) + return isinstance(out, str) and out.endswith(suffix) + + +def expected_path(p_str: str = ".") -> str: + import pathlib + + p = pathlib.Path(p_str) + return p.expanduser().resolve().as_posix() + + +# Minimal dataclass to test get_hash_factors. +# Avoid importing heavy vLLM configs. +@dataclass +class SimpleConfig: + a: object + b: object | None = None + + +class DummyLogprobsMode(Enum): + RAW_LOGITS = "raw_logits" + + +def test_hash_factors_deterministic(): + """Test that hash_factors produces consistent SHA-256 hashes""" + factors = {"a": 1, "b": "test"} + hash1 = hash_factors(factors) + hash2 = hash_factors(factors) + + assert hash1 == hash2 + # Dict key insertion order should not affect the hash. + factors_reordered = {"b": "test", "a": 1} + assert hash_factors(factors_reordered) == hash1 + assert len(hash1) == 64 + assert all(c in "0123456789abcdef" for c in hash1) + + +@pytest.mark.parametrize( + "inp, expected", + [ + (None, None), + (True, True), + (1, 1), + (1.0, 1.0), + ("x", "x"), + (b"ab", "6162"), + (bytearray(b"ab"), "6162"), + ([1, 2], (1, 2)), + ({"b": 2, "a": 1}, (("a", 1), ("b", 2))), + ], +) +def test_normalize_value_matrix(inp, expected): + """Parametric input→expected normalization table.""" + assert normalize_value(inp) == expected + + +def test_normalize_value_enum(): + # Enums normalize to (module.QualName, value). + # DummyLogprobsMode uses a string payload. 
+ out = normalize_value(DummyLogprobsMode.RAW_LOGITS) + assert isinstance(out, tuple) + assert out[0].endswith("DummyLogprobsMode") + # Expect string payload 'raw_logits'. + assert out[1] == "raw_logits" + + +def test_normalize_value_set_order_insensitive(): + # Sets are unordered; normalize_value sorts elements for determinism. + assert normalize_value({3, 1, 2}) == normalize_value({1, 2, 3}) + + +def test_normalize_value_path_normalization(): + from pathlib import Path # local import to avoid global dependency + + # Paths expand/resolve to absolute strings. + # Stabilizes hashing across working dirs. + assert normalize_value(Path(".")) == expected_path(".") + + +def test_normalize_value_uuid_and_to_json(): + # Objects may normalize via uuid() or to_json_string(). + class HasUUID: + def uuid(self): + return "test-uuid" + + class ToJson: + def to_json_string(self): + return '{"x":1}' + + assert normalize_value(HasUUID()) == "test-uuid" + assert normalize_value(ToJson()) == '{"x":1}' + + +@pytest.mark.parametrize( + "bad", + [ + (lambda x: x), + (type("CallableInstance", (), {"__call__": lambda self: 0}))(), + (lambda: (lambda: 0))(), # nested function instance + ], +) +def test_error_cases(bad): + """Inputs expected to raise TypeError.""" + # Reject functions/lambdas/callable instances + # to avoid under-hashing. + with pytest.raises(TypeError): + normalize_value(bad) + + +def test_enum_vs_int_disambiguation(): + # int stays primitive + nf_int = normalize_value(1) + assert nf_int == 1 + + # enum becomes ("module.QualName", value) + nf_enum = normalize_value(DummyLogprobsMode.RAW_LOGITS) + assert isinstance(nf_enum, tuple) and len(nf_enum) == 2 + enum_type, enum_val = nf_enum + assert enum_type.endswith(".DummyLogprobsMode") + assert enum_val == "raw_logits" + + # Build factor dicts from configs with int vs enum + f_int = get_hash_factors(SimpleConfig(1), set()) + f_enum = get_hash_factors(SimpleConfig(DummyLogprobsMode.RAW_LOGITS), set()) + # The int case remains a primitive value + assert f_int["a"] == 1 + # The enum case becomes a tagged tuple ("module.QualName", "raw_logits") + assert isinstance(f_enum["a"], tuple) and f_enum["a"][1] == "raw_logits" + # Factor dicts must differ so we don't collide primitives with Enums. + assert f_int != f_enum + # Hash digests must differ correspondingly + assert hash_factors(f_int) != hash_factors(f_enum) + + # Hash functions produce stable hex strings + h_int = hash_factors(f_int) + h_enum = hash_factors(f_enum) + assert isinstance(h_int, str) and len(h_int) == 64 + assert isinstance(h_enum, str) and len(h_enum) == 64 + + +def test_classes_are_types(): + """Types normalize to FQNs; include real vLLM types.""" + # Only classes allowed; functions/lambdas are rejected. + # Canonical form is the fully-qualified name. 
+ assert isinstance(normalize_value(str), str) + + class LocalDummy: + pass + + assert endswith_fqname(LocalDummy, ".LocalDummy") diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 60ef6eef2166..1e66f21ff638 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -4,12 +4,14 @@ import ast import dataclasses import hashlib +import json import operator import os import pprint import time from collections.abc import Callable, Sequence from contextlib import contextmanager +from functools import partial from typing import Any import torch @@ -23,7 +25,9 @@ should_split, ) from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig +from vllm.config.utils import hash_factors from vllm.logger import init_logger +from vllm.logging_utils import lazy from vllm.platforms import current_platform from vllm.utils.import_utils import resolve_obj_by_qualname from vllm.utils.torch_utils import is_torch_equal_or_newer @@ -580,35 +584,47 @@ def configure_post_pass(self): def __call__( self, graph: fx.GraphModule, example_inputs ) -> VllmSerializableFunction: - from .caching import _compute_code_hash, compilation_config_hash_factors - vllm_config = self.vllm_config + # Minimal hashing here with existing utilities, reused below. + + env_factors = envs.compile_factors() + env_hash = hash_factors(env_factors) + # Compute config/compiler/code hashes once and reuse + config_hash = vllm_config.compute_hash() + compiler_hash = self.compiler_manager.compute_hash(vllm_config) + forward_code_files = list(sorted(self.compilation_config.traced_files)) + + logger.debug( + "Traced files (to be considered for compilation cache):\n%s", + lazy(lambda: "\n".join(forward_code_files)), + ) + hash_content = [] + for filepath in forward_code_files: + hash_content.append(filepath) + if filepath == "": + # This means the function was dynamically generated, with + # e.g. exec(). We can't actually check these. + continue + try: + with open(filepath) as f: + hash_content.append(f.read()) + except Exception: + logger.warning("Failed to read file %s", filepath) + continue + code_hash = hashlib.sha256("\n".join(hash_content).encode()).hexdigest() + # Clear after consumption + self.compilation_config.traced_files.clear() if not self.compilation_config.cache_dir: # no provided cache dir, generate one based on the known factors # that affects the compilation. if none of the factors change, # the cache dir will be the same so that we can reuse the compiled # graph. - - factors = compilation_config_hash_factors(vllm_config) - # 2. factors come from the code files that are traced by Dynamo ( - # it mainly summarizes how the model is used in forward pass) - code_hash = _compute_code_hash(self.compilation_config.traced_files) - self.compilation_config.traced_files.clear() - factors.append(code_hash) - - # 3. compiler hash - compiler_hash = self.compiler_manager.compute_hash(vllm_config) - factors.append(compiler_hash) - - # combine all factors to generate the cache dir - hash_key = hashlib.md5( - str(factors).encode(), usedforsecurity=False - ).hexdigest()[:10] - + factors = [env_hash, config_hash, code_hash, compiler_hash] + # Use SHA-256 for cache key hashing to be consistent across + # compute_hash functions. Truncate for a short cache dir name. 
+ hash_key = hashlib.sha256(str(factors).encode()).hexdigest()[:10] cache_dir = os.path.join( - envs.VLLM_CACHE_ROOT, - "torch_compile_cache", - hash_key, + envs.VLLM_CACHE_ROOT, "torch_compile_cache", hash_key ) self.compilation_config.cache_dir = cache_dir @@ -621,6 +637,7 @@ def __call__( os.makedirs(local_cache_dir, exist_ok=True) self.compilation_config.local_cache_dir = local_cache_dir + # Honors opt-outs such as CompilationMode.NONE or VLLM_DISABLE_COMPILE_CACHE. disable_cache = not is_compile_cache_enabled( self.compilation_config.inductor_compile_config ) @@ -638,6 +655,50 @@ def __call__( local_cache_dir, disable_cache, self.prefix ) + # Reuses existing cache key + + logger.debug( + "torch.compile cache factors: env=%s cfg=%s comp=%s code=%s dir=%s", + env_hash, + config_hash, + compiler_hash, + code_hash, + local_cache_dir, + ) + + # Persist and log only hash-relevant factors together. + try: + logger.debug( + "Compile env factors (raw):\n%s\nVllm config hash: %s", + lazy(partial(pprint.pformat, env_factors, width=120)), + config_hash, + ) + meta_path = os.path.join(local_cache_dir, "cache_key_factors.json") + if not os.path.exists(meta_path): + with open(meta_path, "w") as f: + json.dump( + { + "env": env_factors, # raw factors used for env_hash + "config_hash": config_hash, + "code_hash": code_hash, + "compiler_hash": compiler_hash, + }, + f, + indent=2, + sort_keys=True, + ) + except Exception: + # Best-effort only; metadata write failures are non-fatal. + logger.warning( + ( + "Could not write compile cache metadata at %s; continuing without " + "metadata. Compiled cache remains valid; diagnostics may be " + "limited." + ), + local_cache_dir, + exc_info=True, + ) + # when dynamo calls the backend, it means the bytecode # transform and analysis are done compilation_counter.num_graphs_seen += 1 diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py index 0e8bb2fc9735..fe2547d7feca 100644 --- a/vllm/compilation/pass_manager.py +++ b/vllm/compilation/pass_manager.py @@ -127,7 +127,7 @@ def uuid(self): affects compilation caching. Its uuid depends on the UUIDs of all dependent passes and the pass config. See InductorPass for more info. """ - state = {"pass_config": self.pass_config.uuid(), "passes": []} + state = {"pass_config": self.pass_config.compute_hash(), "passes": []} for pass_ in self.passes: state["passes"].append(pass_.uuid()) state["passes"].append(self.fix_functionalization.uuid()) diff --git a/vllm/config/cache.py b/vllm/config/cache.py index 864cf1be81b2..2652c7c06ad0 100644 --- a/vllm/config/cache.py +++ b/vllm/config/cache.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import hashlib from dataclasses import field from typing import TYPE_CHECKING, Any, Literal @@ -160,13 +159,29 @@ def compute_hash(self) -> str: excluding anything before input ids/embeddings and after the final hidden states. """ - factors: list[Any] = [] - factors.append(self.cache_dtype) - factors.append(self.mamba_cache_dtype) - factors.append(self.mamba_ssm_cache_dtype) - # `cpu_offload_gb` does not use `torch.compile` yet. 
- hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() - return hash_str + ignored_factors = { + # Runtime/derived knobs that don't affect compiled graph shape + "gpu_memory_utilization", + "swap_space", + "is_attention_free", + "num_gpu_blocks_override", + "enable_prefix_caching", + "prefix_caching_hash_algo", + # `cpu_offload_gb` does not use `torch.compile` yet. + "cpu_offload_gb", + "cpu_kvcache_space_bytes", + "mamba_page_size_padded", + # Post-init/derived counters + "num_gpu_blocks", + "num_cpu_blocks", + # WIP feature toggle not impacting compiled graph shape + "kv_sharing_fast_prefill", + } + + from vllm.config.utils import get_hash_factors, hash_factors + + factors = get_hash_factors(self, ignored_factors) + return hash_factors(factors) def metrics_info(self): # convert cache_config to dict(key: str, value: str) for prometheus diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 088d0b1af757..ca01cb3fb55d 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import enum -import hashlib from collections import Counter from collections.abc import Callable from dataclasses import asdict, field @@ -160,7 +159,7 @@ def default_fi_allreduce_fusion_max_size_mb() -> dict[int, float]: current_platform.get_device_capability().to_int(), {} ) - def uuid(self): + def compute_hash(self) -> str: """ Produces a hash unique to the pass configuration. Any new fields that affect compilation should be added to the hash. @@ -506,28 +505,33 @@ class CompilationConfig: def compute_hash(self) -> str: """ - WARNING: Whenever a new field is added to this config, - ensure that it is included in the factors list if - it affects the computation graph. - Provide a hash that uniquely identifies all the configs that affect the structure of the computation graph from input ids/embeddings to the final hidden states, excluding anything before input ids/embeddings and after the final hidden states. """ - factors: list[Any] = [] - factors.append(self.mode) - factors.append(self.backend) - factors.append(self.custom_ops) - factors.append(self.splitting_ops) - factors.append(self.use_inductor) - factors.append(self.use_inductor_graph_partition) - factors.append(self.inductor_compile_config) - factors.append(self.inductor_passes) - factors.append(self.pass_config.uuid()) - factors.append(self.compile_cache_save_format) - return hashlib.sha256(str(factors).encode()).hexdigest() + # Opt-out: default-include declared fields; keep a tiny exclude set; + # normalize types; keep SHA-256. For nested opaque configs, include a + # stable identifier (e.g., pass_config.compute_hash()) instead of object id. 
+ + ignored_factors = { + # Paths/dirs and runtime/metrics that don’t affect compiled graph + "debug_dump_path", + "cache_dir", + "local_cache_dir", + "bs_to_padded_graph_size", + "traced_files", + "compilation_time", + "static_forward_context", + "pass_config", # handled separately below + } + + from vllm.config.utils import get_hash_factors, hash_factors + + factors = get_hash_factors(self, ignored_factors) + factors["pass_config"] = self.pass_config.compute_hash() + return hash_factors(factors) def __repr__(self) -> str: exclude = { diff --git a/vllm/config/model.py b/vllm/config/model.py index f61dbb6a695a..b563a40eb8fc 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import hashlib -import json import warnings from collections.abc import Callable from dataclasses import InitVar, field @@ -18,7 +16,7 @@ from vllm.config.multimodal import MMCacheType, MMEncoderTPMode, MultiModalConfig from vllm.config.pooler import PoolerConfig from vllm.config.scheduler import RunnerType -from vllm.config.utils import assert_hashable, config, getattr_iter +from vllm.config.utils import config, getattr_iter from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.transformers_utils.config import ( @@ -324,50 +322,50 @@ def compute_hash(self) -> str: excluding anything before input ids/embeddings and after the final hidden states. """ - factors: list[Any] = [] - factors.append(self.model) - factors.append(self.dtype) - factors.append(self.quantization) - factors.append(self.revision) - factors.append(self.code_revision) - factors.append(self.max_model_len) - factors.append(self.max_logprobs) - factors.append(self.disable_sliding_window) - factors.append(self.trust_remote_code) - factors.append(self.generation_config) - factors.append(self.model_impl) - factors.append(self.override_generation_config) - factors.append(self.video_pruning_rate) - factors.append(self.enable_prompt_embeds) - - # hf_config can control how the model looks! 
- try: - hf_config_json = self.hf_config.to_json_string(use_diff=False) - except TypeError: - from transformers import PretrainedConfig - - from vllm.utils.jsontree import json_map_leaves - - # Handle nested HF configs with unserializable values gracefully - hf_config_json = ( - json.dumps( - json_map_leaves( - lambda v: v.to_dict() - if isinstance(v, PretrainedConfig) - else str(v), - self.hf_config.to_dict(), - ), - indent=2, - sort_keys=True, - ) - + "\n" - ) - - factors.append(hf_config_json) - - str_factors = str(factors) - assert_hashable(str_factors) - return hashlib.sha256(str(factors).encode()).hexdigest() + ignored_factors = { + "runner", + "convert", + "task", + "tokenizer", + "tokenizer_mode", + "seed", + "hf_config_path", + "allowed_local_media_path", + "allowed_media_domains", + "tokenizer_revision", + "spec_target_max_model_len", + "enforce_eager", + "logprobs_mode", + "disable_cascade_attn", + "skip_tokenizer_init", + "enable_prompt_embeds", + "served_model_name", + "config_format", + "hf_token", + "hf_overrides", + "logits_processor_pattern", + "enable_sleep_mode", + "override_attention_dtype", + "logits_processors", + "io_processor_plugin", + "pooler_config", + "override_pooler_config", + "multimodal_config", + "limit_mm_per_prompt", + "media_io_kwargs", + "mm_processor_kwargs", + "mm_processor_cache_gb", + "mm_processor_cache_type", + "mm_shm_cache_max_object_size_mb", + "mm_encoder_tp_mode", + "interleave_mm_strings", + "skip_mm_profiling", + } + + from vllm.config.utils import get_hash_factors, hash_factors + + factors = get_hash_factors(self, ignored_factors) + return hash_factors(factors) def _update_nested( self, diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 9a6326d62e82..0f107a7a3ef8 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import hashlib import os from typing import TYPE_CHECKING, Any, Literal @@ -448,19 +447,41 @@ def compute_hash(self): This hash is also used for DP worker configuration validation to prevent hangs from mismatched collective communication patterns. 
""" - factors: list[Any] = [] - factors.append(self.pipeline_parallel_size) - factors.append(self.tensor_parallel_size) - factors.append(self.enable_expert_parallel) - factors.append(self.data_parallel_size) - factors.append(self.all2all_backend) - factors.append(self.enable_eplb) - if self.enable_eplb: - factors.append(self.eplb_config.log_balancedness) - factors.append(self.eplb_config.window_size) - factors.append(self.eplb_config.step_interval) - factors.append(self.eplb_config.num_redundant_experts) - return hashlib.sha256(str(factors).encode()).hexdigest() + ignored_factors = { + # Derived/runtime topology, networking, or launch details + "data_parallel_rank", + "data_parallel_rank_local", + "data_parallel_backend", + "data_parallel_external_lb", + "data_parallel_hybrid_lb", + "data_parallel_master_ip", + "data_parallel_master_port", + "_data_parallel_master_port_list", + "data_parallel_rpc_port", + "rank", + "master_addr", + "master_port", + "node_rank", + "nnodes", + "max_parallel_loading_workers", + "disable_custom_all_reduce", + "ray_workers_use_nsight", + "ray_runtime_env", + "placement_group", + "distributed_executor_backend", + "worker_cls", + "sd_worker_cls", + "worker_extension_cls", + "_api_process_count", + "_api_process_rank", + } + + from vllm.config.utils import get_hash_factors, hash_factors + + factors = get_hash_factors(self, ignored_factors) + # Explicitly include backend affecting env factor as before + factors["VLLM_ALL2ALL_BACKEND"] = str(envs.VLLM_ALL2ALL_BACKEND) + return hash_factors(factors) def __post_init__(self) -> None: # Set all2all_backend from env var if not specified, with deprecation warning diff --git a/vllm/config/utils.py b/vllm/config/utils.py index 7e0878d96bbd..02f2b75f608f 100644 --- a/vllm/config/utils.py +++ b/vllm/config/utils.py @@ -3,14 +3,19 @@ """Utility functions for vLLM config dataclasses.""" import ast +import enum +import hashlib import inspect +import json +import pathlib import textwrap -from collections.abc import Iterable +from collections.abc import Iterable, Mapping, Sequence, Set from dataclasses import MISSING, Field, field, fields, is_dataclass, replace from itertools import pairwise from typing import TYPE_CHECKING, Any, Protocol, TypeVar import regex as re +import torch from pydantic.fields import FieldInfo from typing_extensions import runtime_checkable @@ -176,3 +181,115 @@ def update_config(config: ConfigT, overrides: dict[str, Any]) -> ConfigT: ) processed_overrides[field_name] = value return replace(config, **processed_overrides) + + +def normalize_value(x): + """Return a stable, JSON-serializable canonical form for hashing. + Order: primitives, special types (Enum, callable, torch.dtype, Path), then + generic containers (Mapping/Set/Sequence) with recursion. + """ + # Fast path + if x is None or isinstance(x, (bool, int, float, str)): + return x + + # Enums: tag with FQN to avoid primitive collisions. + # Ex: Enum(1) vs int(1) -> ("module.QualName", value). + if isinstance(x, enum.Enum): + enum_type = f"{x.__class__.__module__}.{x.__class__.__qualname__}" + return (enum_type, normalize_value(x.value)) + + # Classes (types) are accepted and canonicalized by their fully-qualified + # name (module.qualname) for a stable identifier. + # Instances are only accepted if they expose uuid(); otherwise they are + # rejected to avoid under-hashing object state. + + # Callables: accept classes only; reject funcs/lambdas/methods. + # Used by LogitsProcessor types and ModelConfig.hf_overrides. 
+ if isinstance(x, type): + module = getattr(x, "__module__", "") + qual = getattr(x, "__qualname__", getattr(x, "__name__", "")) + return ".".join([p for p in (module, qual) if p]) or repr(x) + + # Prefer stable uuid identifiers for objects that provide them, even if + # they are callable instances (e.g., InductorPass wrappers). + if hasattr(x, "uuid") and callable(getattr(x, "uuid", None)): + return x.uuid() + + if callable(x): + raise TypeError("normalize_value: function or callable instance unsupported") + + # Torch dtype: stringify (torch.float64 -> "torch.float64"). + # We rely on the string form here; dtype-bearing fields that need additional + # disambiguation should encode that at the config layer. + if isinstance(x, torch.dtype): + return str(x) + + # Bytes + if isinstance(x, (bytes, bytearray)): + return x.hex() + + # Paths (canonicalize) + if isinstance(x, pathlib.Path): + try: + return str(x.expanduser().resolve()) + except Exception: + return str(x) + + # Dataclasses: represent as (FQN, sorted(field,value) tuple) for stability. + if is_dataclass(x): + type_fqn = f"{x.__class__.__module__}.{x.__class__.__qualname__}" + items = tuple( + (f.name, normalize_value(getattr(x, f.name))) + for f in sorted(fields(x), key=lambda f: f.name) + ) + return (type_fqn, items) + + # Containers (generic) + if isinstance(x, Mapping): + return tuple(sorted((str(k), normalize_value(v)) for k, v in x.items())) + if isinstance(x, Set): + return tuple(sorted(repr(normalize_value(v)) for v in x)) + if isinstance(x, Sequence) and not isinstance(x, (str, bytes, bytearray)): + return tuple(normalize_value(v) for v in x) + + # PretrainedConfig + if hasattr(x, "to_json_string") and callable(x.to_json_string): + return x.to_json_string() + + # Unsupported type: e.g., modules, generators, open files, or objects + # without a stable JSON/UUID representation. Hard-error to avoid + # under-hashing. + # If you hit this, either reshape your config to use supported primitives + # and containers, or extend normalize_value to provide a stable encoding + # (e.g., via uuid() or to_json_string()) for this type. + raise TypeError( + f"normalize_value: unsupported type '{type(x).__name__}'. " + "Ensure config values use supported primitives/containers or add a " + "stable representation for this type." + ) + + +def get_hash_factors(config: ConfigT, ignored_factors: set[str]) -> dict[str, object]: + """Gets the factors used for hashing a config class. + - Includes all dataclass fields not in `ignored_factors`. + - Errors on non-normalizable values. 
+ """ + factors: dict[str, object] = {} + for dc_field in fields(config): + factor = dc_field.name + if factor in ignored_factors: + continue + value = getattr(config, factor, None) + try: + factors[factor] = normalize_value(value) + except TypeError as e: + raise TypeError( + f"get_hash_factors: unsupported type for key '{factor}' " + f"({type(value).__name__})" + ) from e + return factors + + +def hash_factors(items: dict[str, object]) -> str: + """Return a SHA-256 hex digest of the canonical items structure.""" + return hashlib.sha256(json.dumps(items, sort_keys=True).encode()).hexdigest() diff --git a/vllm/envs.py b/vllm/envs.py index e61fb114325c..212d68114e46 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -2,8 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import functools -import hashlib import json +import logging import os import sys import tempfile @@ -426,6 +426,8 @@ def get_vllm_port() -> int | None: # --8<-- [start:env-vars-definition] +logger = logging.getLogger(__name__) + environment_variables: dict[str, Callable[[], Any]] = { # ================== Installation Time Env Vars ================== # Target device of vLLM, supporting [cuda (by default), @@ -1540,85 +1542,88 @@ def is_set(name: str): raise AttributeError(f"module {__name__!r} has no attribute {name!r}") -def compute_hash() -> str: - """ - WARNING: Whenever a new key is added to this environment - variables, ensure that it is included in the factors list if - it affects the computation graph. For example, different values - of VLLM_PP_LAYER_PARTITION will generate different computation - graphs, so it is included in the factors list. The env vars that - affect the choice of different kernels or attention backends should - also be included in the factors list. - """ - - # The values of envs may affects the computation graph. - # TODO(DefTruth): hash all environment variables? 
- # for key in environment_variables: - # factorize(key) - environment_variables_to_hash = [ - "VLLM_PP_LAYER_PARTITION", - "VLLM_MLA_DISABLE", - "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH", - "VLLM_USE_TRITON_AWQ", - "VLLM_DP_RANK", - "VLLM_DP_SIZE", - "VLLM_USE_STANDALONE_COMPILE", - "VLLM_FUSED_MOE_CHUNK_SIZE", - "VLLM_FLASHINFER_MOE_BACKEND", - "VLLM_V1_USE_PREFILL_DECODE_ATTENTION", - "VLLM_ATTENTION_BACKEND", - "VLLM_USE_FLASHINFER_SAMPLER", - "VLLM_DISABLED_KERNELS", - "VLLM_USE_DEEP_GEMM", - "VLLM_MOE_USE_DEEP_GEMM", - "VLLM_USE_DEEP_GEMM_E8M0", - "VLLM_USE_FUSED_MOE_GROUPED_TOPK", - "VLLM_USE_FLASHINFER_MOE_FP16", - "VLLM_USE_FLASHINFER_MOE_FP8", - "VLLM_USE_FLASHINFER_MOE_FP4", - "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", - "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS", - "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", - "VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE", - "VLLM_USE_CUDNN_PREFILL", - "VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL", - "VLLM_USE_TRTLLM_ATTENTION", - "VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION", - "VLLM_ROCM_USE_AITER", - "VLLM_ROCM_USE_AITER_PAGED_ATTN", - "VLLM_ROCM_USE_AITER_LINEAR", - "VLLM_ROCM_USE_AITER_MOE", - "VLLM_ROCM_USE_AITER_RMSNORM", - "VLLM_ROCM_USE_AITER_MLA", - "VLLM_ROCM_USE_AITER_MHA", - "VLLM_ROCM_USE_AITER_FP4_ASM_GEMM", - "VLLM_ROCM_USE_AITER_TRITON_ROPE", - "VLLM_ROCM_USE_AITER_FP8BMM", - "VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION", - "VLLM_ROCM_USE_AITER_TRITON_GEMM", - "VLLM_ROCM_USE_SKINNY_GEMM", - "VLLM_ROCM_FP8_PADDING", - "VLLM_ROCM_MOE_PADDING", - "VLLM_ROCM_CUSTOM_PAGED_ATTN", - "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION", - "VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16", - "VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB", - "VLLM_ROCM_FP8_MFMA_PAGE_ATTN", - "VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE", - "VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING", - "VLLM_NVFP4_GEMM_BACKEND", - "VLLM_USE_FBGEMM", - "VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE", - "VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL", - ] - for key in environment_variables_to_hash: - # if this goes out of sync with environment_variables, - # it's not a user error, it's a bug - assert key in environment_variables, ( - "Please update environment_variables_to_hash in envs.py" - ) +def compile_factors() -> dict[str, object]: + """Return env vars used for torch.compile cache keys. + + Start with every known vLLM env var; drop entries in `ignored_factors`; + hash everything else. 
This keeps the cache key aligned across workers.""" + + ignored_factors: set[str] = { + "MAX_JOBS", + "VLLM_RPC_BASE_PATH", + "VLLM_USE_MODELSCOPE", + "VLLM_RINGBUFFER_WARNING_INTERVAL", + "VLLM_DEBUG_DUMP_PATH", + "VLLM_PORT", + "VLLM_CACHE_ROOT", + "LD_LIBRARY_PATH", + "VLLM_SERVER_DEV_MODE", + "VLLM_DP_MASTER_IP", + "VLLM_DP_MASTER_PORT", + "VLLM_RANDOMIZE_DP_DUMMY_INPUTS", + "VLLM_CI_USE_S3", + "VLLM_MODEL_REDIRECT_PATH", + "VLLM_HOST_IP", + "S3_ACCESS_KEY_ID", + "S3_SECRET_ACCESS_KEY", + "S3_ENDPOINT_URL", + "VLLM_USAGE_STATS_SERVER", + "VLLM_NO_USAGE_STATS", + "VLLM_DO_NOT_TRACK", + "VLLM_LOGGING_LEVEL", + "VLLM_LOGGING_PREFIX", + "VLLM_LOGGING_STREAM", + "VLLM_LOGGING_CONFIG_PATH", + "VLLM_LOG_STATS_INTERVAL", + "VLLM_DEBUG_LOG_API_SERVER_RESPONSE", + "VLLM_TUNED_CONFIG_FOLDER", + "VLLM_ENGINE_ITERATION_TIMEOUT_S", + "VLLM_HTTP_TIMEOUT_KEEP_ALIVE", + "VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS", + "VLLM_KEEP_ALIVE_ON_ENGINE_DEATH", + "VLLM_SLEEP_WHEN_IDLE", + "VLLM_IMAGE_FETCH_TIMEOUT", + "VLLM_VIDEO_FETCH_TIMEOUT", + "VLLM_AUDIO_FETCH_TIMEOUT", + "VLLM_MEDIA_URL_ALLOW_REDIRECTS", + "VLLM_MEDIA_LOADING_THREAD_COUNT", + "VLLM_MAX_AUDIO_CLIP_FILESIZE_MB", + "VLLM_VIDEO_LOADER_BACKEND", + "VLLM_MEDIA_CONNECTOR", + "VLLM_ASSETS_CACHE", + "VLLM_ASSETS_CACHE_MODEL_CLEAN", + "VLLM_MM_INPUT_CACHE_GIB", + "VLLM_WORKER_MULTIPROC_METHOD", + "VLLM_ENABLE_V1_MULTIPROCESSING", + "VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", + "VLLM_CPU_KVCACHE_SPACE", + "VLLM_CPU_OMP_THREADS_BIND", + "VLLM_CPU_NUM_OF_RESERVED_CPU", + "VLLM_CPU_MOE_PREPACK", + "VLLM_CPU_SGL_KERNEL", + "VLLM_TEST_FORCE_LOAD_FORMAT", + "LOCAL_RANK", + "CUDA_VISIBLE_DEVICES", + } + + from vllm.config.utils import normalize_value + + factors: dict[str, object] = {} + for factor, getter in environment_variables.items(): + if factor in ignored_factors: + continue + + try: + raw = getter() + except Exception as exc: # pragma: no cover - defensive logging + logger.warning( + "Skipping environment variable %s while hashing compile factors: %s", + factor, + exc, + ) + continue - factors = [environment_variables[key]() for key in environment_variables_to_hash] + factors[factor] = normalize_value(raw) ray_noset_env_vars = [ # Refer to @@ -1641,8 +1646,8 @@ def compute_hash() -> str: "RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR", "RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES", ] - factors.extend([os.getenv(var) for var in ray_noset_env_vars]) - hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() + for var in ray_noset_env_vars: + factors[var] = normalize_value(os.getenv(var)) - return hash_str + return factors diff --git a/vllm/logging_utils/__init__.py b/vllm/logging_utils/__init__.py index 7202259ca21a..44b40ead973b 100644 --- a/vllm/logging_utils/__init__.py +++ b/vllm/logging_utils/__init__.py @@ -2,9 +2,11 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.logging_utils.formatter import NewLineFormatter +from vllm.logging_utils.lazy import lazy from vllm.logging_utils.log_time import logtime __all__ = [ "NewLineFormatter", + "lazy", "logtime", ] diff --git a/vllm/logging_utils/lazy.py b/vllm/logging_utils/lazy.py new file mode 100644 index 000000000000..3ade79896285 --- /dev/null +++ b/vllm/logging_utils/lazy.py @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Callable +from typing import Any + + +class lazy: + """Wrap a zero-argument callable evaluated only during log formatting.""" + + 
__slots__ = ("_factory",) + + def __init__(self, factory: Callable[[], Any]) -> None: + self._factory = factory + + def __str__(self) -> str: + return str(self._factory()) + + def __repr__(self) -> str: + return str(self) From 48fc8b1e595766af9c91edfc1de43f3a352575eb Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Wed, 19 Nov 2025 10:04:07 -0500 Subject: [PATCH 192/578] [BugFix] Fix async-scheduling + FlashAttn MLA (#28990) Signed-off-by: Lucas Wilkinson --- vllm/v1/attention/backends/mla/common.py | 15 +++++++++------ vllm/v1/attention/backends/mla/flashattn_mla.py | 2 +- vllm/v1/attention/backends/utils.py | 1 + vllm/v1/worker/gpu_model_runner.py | 10 +++++++--- 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 2ccdd1f143ce..e328049b53c7 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -755,6 +755,7 @@ def build( seq_lens = common_attn_metadata.seq_lens seq_lens_cpu = common_attn_metadata.seq_lens_cpu dcp_local_seq_lens = common_attn_metadata.dcp_local_seq_lens + dcp_local_seq_lens_cpu = common_attn_metadata.dcp_local_seq_lens_cpu query_seq_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1] @@ -944,18 +945,20 @@ def build( decode_metadata = None if num_decodes > 0: + dcp_tot_seq_lens_device = None + if self.dcp_world_size > 1: + dcp_tot_seq_lens_device = seq_lens[:num_decodes] + seq_lens_cpu = dcp_local_seq_lens_cpu + seq_lens = dcp_local_seq_lens + decode_metadata = self._build_decode( block_table_tensor=block_table_tensor[:num_decodes, ...], seq_lens_cpu=seq_lens_cpu[:num_decodes], - seq_lens_device=dcp_local_seq_lens[:num_decodes] - if self.dcp_world_size > 1 and dcp_local_seq_lens is not None - else seq_lens[:num_decodes], + seq_lens_device=seq_lens[:num_decodes], query_start_loc_cpu=query_start_loc_cpu[: num_decodes + 1], query_start_loc_device=query_start_loc[: num_decodes + 1], num_decode_tokens=num_decode_tokens, - dcp_tot_seq_lens_device=seq_lens[:num_decodes] - if self.dcp_world_size > 1 - else None, + dcp_tot_seq_lens_device=dcp_tot_seq_lens_device, ) attn_metadata = self.metadata_cls( diff --git a/vllm/v1/attention/backends/mla/flashattn_mla.py b/vllm/v1/attention/backends/mla/flashattn_mla.py index 7794e89cc0a9..12639edc8b9a 100644 --- a/vllm/v1/attention/backends/mla/flashattn_mla.py +++ b/vllm/v1/attention/backends/mla/flashattn_mla.py @@ -173,7 +173,7 @@ def _build_decode( ) -> FlashAttnMLADecodeMetadata: query_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1] max_query_len = query_lens_cpu.max().item() - max_seq_len = seq_lens_device.max().item() + max_seq_len = seq_lens_cpu.max().item() # For Flash Attention MLA + full cudagraph max_num_splits = 0 diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 578153cda786..0dd189633129 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -92,6 +92,7 @@ class CommonAttentionMetadata: encoder_seq_lens: np.ndarray | None = None dcp_local_seq_lens: torch.Tensor | None = None + dcp_local_seq_lens_cpu: torch.Tensor | None = None """Sequence lengths of the local rank in decode context parallelism world""" diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 506118d2d762..3b00085b6bb9 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1451,9 +1451,12 @@ def _build_attention_metadata( 
num_computed_tokens_cpu = self.input_batch.num_computed_tokens_cpu_tensor[ :num_reqs ] - dcp_local_seq_lens = ( - self.dcp_local_seq_lens.gpu[:num_reqs] if self.dcp_world_size > 1 else None - ) + + dcp_local_seq_lens, dcp_local_seq_lens_cpu = None, None + if self.dcp_world_size > 1: + dcp_local_seq_lens = self.dcp_local_seq_lens.gpu[:num_reqs] + dcp_local_seq_lens_cpu = self.dcp_local_seq_lens.cpu[:num_reqs] + spec_decode_common_attn_metadata = None if for_cudagraph_capture: @@ -1521,6 +1524,7 @@ causal=True, encoder_seq_lens=encoder_seq_lens, dcp_local_seq_lens=dcp_local_seq_lens, + dcp_local_seq_lens_cpu=dcp_local_seq_lens_cpu, ) if self.speculative_config and spec_decode_common_attn_metadata is None: From d44e9df7d49a9bb3400b002c38c06fae2dd7d1e8 Mon Sep 17 00:00:00 2001 From: Shanshan Shen <467638484@qq.com> Date: Thu, 20 Nov 2025 00:24:55 +0800 Subject: [PATCH 193/578] [Model][Mamba] Add selector for mamba attention backend and make it pluggable for other device (#26487) Signed-off-by: shen-shanshan <467638484@qq.com> --- docs/contributing/model/basic.md | 1 + vllm/attention/__init__.py | 3 +- vllm/attention/backends/registry.py | 114 +++++++++++++++--- vllm/attention/selector.py | 33 ++++- vllm/model_executor/layers/kda.py | 8 +- vllm/model_executor/layers/mamba/abstract.py | 10 +- .../layers/mamba/linear_attn.py | 14 --- .../layers/mamba/mamba_mixer.py | 10 +- .../layers/mamba/mamba_mixer2.py | 9 -- .../model_executor/layers/mamba/short_conv.py | 9 -- vllm/model_executor/models/plamo2.py | 9 -- vllm/model_executor/models/qwen3_next.py | 9 +- 12 files changed, 144 insertions(+), 85 deletions(-) diff --git a/docs/contributing/model/basic.md b/docs/contributing/model/basic.md index a7b54f015c2d..d7f5d2f311a3 100644 --- a/docs/contributing/model/basic.md +++ b/docs/contributing/model/basic.md @@ -146,6 +146,7 @@ We use "mamba-like" to refer to layers that possess a state that is updated in-pl For implementing new custom mamba-like layers, one should inherit from `MambaBase` and implement the methods `get_state_dtype`, `get_state_shape` to calculate the data types and state shapes at runtime, as well as `mamba_type` and `get_attn_backend`. It is also necessary to implement the "attention meta-data" class which handles the meta-data that is common across all layers. Please see [`LinearAttentionMetadata`](../../../vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](../../../vllm/v1/attention/backends/short_conv_attn.py) for examples of this. +It is also worth noting that we should update `MAMBA_TYPE_TO_BACKEND_MAP` and `MambaAttentionBackendEnum` in [`registry.py`](../../../vllm/attention/backends/registry.py) when adding a new mamba backend. Finally, if one wants to support torch compile and CUDA graphs, it is necessary to wrap the call to the mamba-like layer inside a custom op and register it. Please see the calls to `direct_register_custom_op` in [vllm/model_executor/models/minimax_text_01.py](../../../vllm/model_executor/models/minimax_text_01.py) or [vllm/model_executor/layers/mamba/short_conv.py](../../../vllm/model_executor/layers/mamba/short_conv.py) for examples of this. The new custom op should then be added to the list `_attention_ops` in [vllm/config/compilation.py](../../../vllm/config/compilation.py) to ensure that piecewise CUDA graphs works as intended.
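The documentation paragraph above, together with the `registry.py` changes later in this patch, suggests what registering a new mamba backend looks like. A hedged sketch follows, mirroring the docstring examples in the registry diff below; the backend class here is hypothetical and assumed to implement vLLM's `AttentionBackend` interface.

```python
# Hedged sketch: registering a third-party mamba attention backend under the
# CUSTOM slot, mirroring the docstring examples in the registry.py diff below.
# MyMambaAttentionBackend is hypothetical and is assumed to implement vLLM's
# AttentionBackend interface.
from vllm.attention.backends.registry import (
    MambaAttentionBackendEnum,
    register_backend,
)


@register_backend(MambaAttentionBackendEnum.CUSTOM, is_mamba=True)
class MyMambaAttentionBackend:
    ...


# The intent is that the CUSTOM member now resolves to the registered class:
# MambaAttentionBackendEnum.CUSTOM.get_class() -> MyMambaAttentionBackend
```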
diff --git a/vllm/attention/__init__.py b/vllm/attention/__init__.py index dd35165d5415..8b4dc4013362 100644 --- a/vllm/attention/__init__.py +++ b/vllm/attention/__init__.py @@ -7,7 +7,7 @@ AttentionType, ) from vllm.attention.layer import Attention -from vllm.attention.selector import get_attn_backend +from vllm.attention.selector import get_attn_backend, get_mamba_attn_backend __all__ = [ "Attention", @@ -15,4 +15,5 @@ "AttentionMetadata", "AttentionType", "get_attn_backend", + "get_mamba_attn_backend", ] diff --git a/vllm/attention/backends/registry.py b/vllm/attention/backends/registry.py index f07a6059be37..51899b023591 100644 --- a/vllm/attention/backends/registry.py +++ b/vllm/attention/backends/registry.py @@ -2,8 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention backend registry""" -import enum from collections.abc import Callable +from enum import Enum, EnumMeta from typing import TYPE_CHECKING, cast from vllm.logger import init_logger @@ -15,7 +15,7 @@ logger = init_logger(__name__) -class _AttentionBackendEnumMeta(enum.EnumMeta): +class _AttentionBackendEnumMeta(EnumMeta): """Metaclass for AttentionBackendEnum to provide better error messages.""" def __getitem__(cls, name: str): @@ -23,15 +23,15 @@ def __getitem__(cls, name: str): try: return super().__getitem__(name) except KeyError: - members = cast("dict[str, AttentionBackendEnum]", cls.__members__).values() - valid_backends = ", ".join(m.name for m in members) + members = cast("dict[str, Enum]", cls.__members__).keys() + valid_backends = ", ".join(members) raise ValueError( f"Unknown attention backend: '{name}'. " f"Valid options are: {valid_backends}" ) from None -class AttentionBackendEnum(enum.Enum, metaclass=_AttentionBackendEnumMeta): +class AttentionBackendEnum(Enum, metaclass=_AttentionBackendEnumMeta): """Enumeration of all supported attention backends. The enum value is the default class path, but this can be overridden @@ -83,7 +83,7 @@ def get_path(self, include_classname: bool = True) -> str: Raises: ValueError: If Backend.CUSTOM is used without being registered """ - path = _OVERRIDES.get(self, self.value) + path = _ATTN_OVERRIDES.get(self, self.value) if not path: raise ValueError( f"Backend {self.name} must be registered before use. " @@ -111,18 +111,93 @@ def is_overridden(self) -> bool: Returns: True if the backend has a registered override """ - return self in _OVERRIDES + return self in _ATTN_OVERRIDES def clear_override(self) -> None: """Clear any override for this backend, reverting to the default.""" - _OVERRIDES.pop(self, None) + _ATTN_OVERRIDES.pop(self, None) -_OVERRIDES: dict[AttentionBackendEnum, str] = {} +class MambaAttentionBackendEnum(Enum, metaclass=_AttentionBackendEnumMeta): + """Enumeration of all supported mamba attention backends. + + The enum value is the default class path, but this can be overridden + at runtime using register_backend(). 
+ + To get the actual backend class (respecting overrides), use: + backend.get_class() + """ + + MAMBA1 = "vllm.v1.attention.backends.mamba1_attn.Mamba1AttentionBackend" + MAMBA2 = "vllm.v1.attention.backends.mamba2_attn.Mamba2AttentionBackend" + SHORT_CONV = "vllm.v1.attention.backends.short_conv_attn.ShortConvAttentionBackend" + LINEAR = "vllm.v1.attention.backends.linear_attn.LinearAttentionBackend" + GDN_ATTN = "vllm.v1.attention.backends.gdn_attn.GDNAttentionBackend" + # Placeholder for third-party/custom backends - must be registered before use + CUSTOM = "" + + def get_path(self, include_classname: bool = True) -> str: + """Get the class path for this backend (respects overrides). + + Returns: + The fully qualified class path string + + Raises: + ValueError: If Backend.CUSTOM is used without being registered + """ + path = _MAMBA_ATTN_OVERRIDES.get(self, self.value) + if not path: + raise ValueError( + f"Backend {self.name} must be registered before use. " + f"Use register_backend(Backend.{self.name}, 'your.module.YourClass')" + ) + if not include_classname: + path = path.rsplit(".", 1)[0] + return path + + def get_class(self) -> "type[AttentionBackend]": + """Get the backend class (respects overrides). + + Returns: + The backend class + + Raises: + ImportError: If the backend class cannot be imported + ValueError: If Backend.CUSTOM is used without being registered + """ + return resolve_obj_by_qualname(self.get_path()) + + def is_overridden(self) -> bool: + """Check if this backend has been overridden. + + Returns: + True if the backend has a registered override + """ + return self in _MAMBA_ATTN_OVERRIDES + + def clear_override(self) -> None: + """Clear any override for this backend, reverting to the default.""" + _MAMBA_ATTN_OVERRIDES.pop(self, None) + + +MAMBA_TYPE_TO_BACKEND_MAP = { + "mamba1": MambaAttentionBackendEnum.MAMBA1.name, + "mamba2": MambaAttentionBackendEnum.MAMBA2.name, + "short_conv": MambaAttentionBackendEnum.SHORT_CONV.name, + "linear_attention": MambaAttentionBackendEnum.LINEAR.name, + "gdn_attention": MambaAttentionBackendEnum.GDN_ATTN.name, + "custom": MambaAttentionBackendEnum.CUSTOM.name, +} + + +_ATTN_OVERRIDES: dict[AttentionBackendEnum, str] = {} +_MAMBA_ATTN_OVERRIDES: dict[MambaAttentionBackendEnum, str] = {} def register_backend( - backend: AttentionBackendEnum, class_path: str | None = None + backend: AttentionBackendEnum | MambaAttentionBackendEnum, + is_mamba: bool = False, + class_path: str | None = None, ) -> Callable[[type], type]: """Register or override a backend implementation. @@ -135,12 +210,17 @@ def register_backend( Decorator function if class_path is None, otherwise a no-op Examples: - # Override an existing backend + # Override an existing attention backend @register_backend(AttentionBackendEnum.FLASH_ATTN) class MyCustomFlashAttn: ... - # Register a custom third-party backend + # Override an existing mamba attention backend + @register_backend(MambaAttentionBackendEnum.LINEAR, is_mamba=True) + class MyCustomMambaAttn: + ... + + # Register a custom third-party attention backend @register_backend(AttentionBackendEnum.CUSTOM) class MyCustomBackend: ... 
@@ -153,11 +233,17 @@ class MyCustomBackend: """ def decorator(cls: type) -> type: - _OVERRIDES[backend] = f"{cls.__module__}.{cls.__qualname__}" + if is_mamba: + _MAMBA_ATTN_OVERRIDES[backend] = f"{cls.__module__}.{cls.__qualname__}" # type: ignore[index] + else: + _ATTN_OVERRIDES[backend] = f"{cls.__module__}.{cls.__qualname__}" # type: ignore[index] return cls if class_path is not None: - _OVERRIDES[backend] = class_path + if is_mamba: + _MAMBA_ATTN_OVERRIDES[backend] = class_path # type: ignore[index] + else: + _ATTN_OVERRIDES[backend] = class_path # type: ignore[index] return lambda x: x return decorator diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 1a092db9ce37..e9af08b2316d 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -12,7 +12,11 @@ import vllm.envs as envs from vllm.attention.backends.abstract import AttentionBackend -from vllm.attention.backends.registry import AttentionBackendEnum +from vllm.attention.backends.registry import ( + MAMBA_TYPE_TO_BACKEND_MAP, + AttentionBackendEnum, + MambaAttentionBackendEnum, +) from vllm.config.cache import CacheDType from vllm.logger import init_logger from vllm.utils import STR_BACKEND_ENV_VAR @@ -197,6 +201,33 @@ def _cached_get_attn_backend( return backend +def get_mamba_attn_backend( + mamba_type: str, +) -> type[AttentionBackend]: + """Select which mamba attention backend to use and lazily import it.""" + return _cached_get_mamba_attn_backend(mamba_type) + + +@cache +def _cached_get_mamba_attn_backend( + mamba_type: str, +) -> type[AttentionBackend]: + assert mamba_type and isinstance(mamba_type, str) + + selected_backend = None + try: + backend_name = MAMBA_TYPE_TO_BACKEND_MAP[mamba_type] + selected_backend = MambaAttentionBackendEnum[backend_name] + except KeyError as e: + raise ValueError( + f"Invalid mamba attention backend type: '{backend_name}'. 
Valid " + f"backends are: {list(MambaAttentionBackendEnum.__members__.keys())}" + ) from e + + mamba_attn_backend = selected_backend.get_class() + return mamba_attn_backend + + @contextmanager def global_force_attn_backend_context_manager( attn_backend: AttentionBackendEnum, diff --git a/vllm/model_executor/layers/kda.py b/vllm/model_executor/layers/kda.py index 2e7500bac718..27cc3884517f 100644 --- a/vllm/model_executor/layers/kda.py +++ b/vllm/model_executor/layers/kda.py @@ -5,7 +5,6 @@ from einops import rearrange from torch import nn -from vllm.attention import AttentionBackend from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config from vllm.distributed import ( @@ -83,12 +82,7 @@ def kda_attention_fake( class KimiDeltaAttention(nn.Module, MambaBase): @property def mamba_type(self) -> str: - return "linear_attention" - - def get_attn_backend(self) -> type["AttentionBackend"]: - from vllm.v1.attention.backends.gdn_attn import GDNAttentionBackend - - return GDNAttentionBackend + return "gdn_attention" def get_state_dtype( self, diff --git a/vllm/model_executor/layers/mamba/abstract.py b/vllm/model_executor/layers/mamba/abstract.py index e68b09b4d81f..aa919d6fdc35 100644 --- a/vllm/model_executor/layers/mamba/abstract.py +++ b/vllm/model_executor/layers/mamba/abstract.py @@ -6,6 +6,7 @@ import torch +from vllm.attention.selector import get_mamba_attn_backend from vllm.config import VllmConfig from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase from vllm.v1.kv_cache_interface import KVCacheSpec, MambaSpec @@ -38,11 +39,6 @@ def get_state_shape(self) -> Iterable[tuple[int, ...]]: def mamba_type(self) -> str: pass - @abstractmethod - def get_attn_backend(self) -> type["AttentionBackend"]: - """Get the attention backend class for this Mamba layer.""" - pass - @abstractmethod def get_state_dtype(self) -> tuple[torch.dtype, ...]: pass @@ -69,3 +65,7 @@ def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec | None: else 0 ), ) + + def get_attn_backend(self) -> type["AttentionBackend"]: + """Get the attention backend class for this Mamba layer.""" + return get_mamba_attn_backend(self.mamba_type) diff --git a/vllm/model_executor/layers/mamba/linear_attn.py b/vllm/model_executor/layers/mamba/linear_attn.py index 0a2742ff49a4..d85b3e61c5d6 100644 --- a/vllm/model_executor/layers/mamba/linear_attn.py +++ b/vllm/model_executor/layers/mamba/linear_attn.py @@ -2,12 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend - -from typing import TYPE_CHECKING import torch import torch.nn.functional as F @@ -37,9 +31,6 @@ from vllm.utils.torch_utils import direct_register_custom_op from vllm.v1.attention.backends.linear_attn import LinearAttentionMetadata -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend - class MiniMaxText01RMSNormTP(CustomOp): name = "MiniMaxText01RMSNormTP" @@ -123,11 +114,6 @@ class MiniMaxText01LinearAttention(nn.Module, MambaBase): def mamba_type(self) -> str: return "linear_attention" - def get_attn_backend(self) -> type["AttentionBackend"]: - from vllm.v1.attention.backends.linear_attn import LinearAttentionBackend - - return LinearAttentionBackend - def get_state_dtype(self) -> tuple[torch.dtype]: assert self.model_config is not None assert self.cache_config is not None diff --git 
a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py index b6345b8af7f0..90e520e24441 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer.py @@ -1,10 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import TYPE_CHECKING, NamedTuple - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend +from typing import NamedTuple import torch from torch import nn @@ -452,11 +449,6 @@ def get_state_shape(self) -> tuple[tuple[int, ...], tuple[int, ...]]: def mamba_type(self) -> str: return "mamba1" - def get_attn_backend(self) -> type["AttentionBackend"]: - from vllm.v1.attention.backends.mamba1_attn import Mamba1AttentionBackend - - return Mamba1AttentionBackend - def _time_proj_bias(self) -> torch.Tensor | None: if hasattr(self.dt_proj, "bias") and self.dt_proj.bias is not None: return self.dt_proj.bias.float() diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index 57313990b820..900701c46348 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -1,10 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend import torch from torch import nn @@ -908,11 +904,6 @@ def get_state_shape(self) -> tuple[tuple[int, ...], tuple[int, ...]]: def mamba_type(self) -> str: return "mamba2" - def get_attn_backend(self) -> type["AttentionBackend"]: - from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionBackend - - return Mamba2AttentionBackend - def mamba_mixer2( projected_states: torch.Tensor, diff --git a/vllm/model_executor/layers/mamba/short_conv.py b/vllm/model_executor/layers/mamba/short_conv.py index 04efa8a8b373..0bbad17d7ebc 100644 --- a/vllm/model_executor/layers/mamba/short_conv.py +++ b/vllm/model_executor/layers/mamba/short_conv.py @@ -1,10 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend import torch @@ -232,11 +228,6 @@ def get_state_shape(self) -> tuple[tuple[int, ...]]: def mamba_type(self) -> str: return "short_conv" - def get_attn_backend(self) -> type["AttentionBackend"]: - from vllm.v1.attention.backends.short_conv_attn import ShortConvAttentionBackend - - return ShortConvAttentionBackend - def short_conv( hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index 0c87f5000ff4..52c9755e0e0e 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -4,10 +4,6 @@ from collections.abc import Iterable from itertools import islice -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend import torch from torch import nn @@ -467,11 +463,6 @@ def get_state_shape(self) -> tuple[tuple[int, ...], tuple[int, ...]]: def mamba_type(self) -> str: return "mamba2" - def get_attn_backend(self) -> type["AttentionBackend"]: - from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionBackend - - return Mamba2AttentionBackend - def 
plamo2_mamba_mixer( hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index 0415c8e00fdf..ad631f61e4b9 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -10,7 +10,7 @@ from torch import nn from transformers.activations import ACT2FN -from vllm.attention import Attention, AttentionBackend, AttentionMetadata +from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile from vllm.config import ( CacheConfig, @@ -216,12 +216,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class Qwen3NextGatedDeltaNet(nn.Module, MambaBase): @property def mamba_type(self) -> str: - return "linear_attention" - - def get_attn_backend(self) -> type["AttentionBackend"]: - from vllm.v1.attention.backends.gdn_attn import GDNAttentionBackend - - return GDNAttentionBackend + return "gdn_attention" def get_state_dtype(self) -> tuple[torch.dtype, torch.dtype]: return MambaStateDtypeCalculator.gated_delta_net_state_dtype( From a8b70304d68497ac1c432a2ff343e9bfb516c227 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 19 Nov 2025 18:06:36 +0100 Subject: [PATCH 194/578] Update `rope_scaling` to `rope_parameters` in preparation for Transformers v5 (#28542) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .buildkite/test-pipeline.yaml | 6 +- benchmarks/kernels/benchmark_mrope.py | 19 ++-- .../offline_inference/context_extension.py | 6 +- tests/compile/test_functionalization.py | 4 +- tests/kernels/core/test_mrope.py | 16 +-- tests/kernels/core/test_pos_encoding.py | 39 +++---- .../moe/test_gpt_oss_triton_kernels.py | 2 +- .../pooling/test_nomic_max_model_len.py | 16 +-- tests/test_config.py | 37 ++++--- vllm/config/model.py | 63 +++++------ .../layers/rotary_embedding/__init__.py | 76 ++++++------- vllm/model_executor/models/afmoe.py | 17 +-- vllm/model_executor/models/apertus.py | 22 +--- vllm/model_executor/models/arcee.py | 11 -- vllm/model_executor/models/arctic.py | 3 +- vllm/model_executor/models/baichuan.py | 8 +- vllm/model_executor/models/bailing_moe.py | 3 +- vllm/model_executor/models/bamba.py | 6 +- vllm/model_executor/models/chameleon.py | 29 +---- vllm/model_executor/models/chatglm.py | 3 +- vllm/model_executor/models/commandr.py | 5 +- vllm/model_executor/models/config.py | 22 ++-- vllm/model_executor/models/dbrx.py | 7 +- vllm/model_executor/models/deepseek_v2.py | 43 +++----- vllm/model_executor/models/dots1.py | 11 +- vllm/model_executor/models/ernie45_moe.py | 14 +-- vllm/model_executor/models/ernie45_vl_moe.py | 13 +-- vllm/model_executor/models/exaone.py | 21 +--- vllm/model_executor/models/exaone4.py | 19 +--- vllm/model_executor/models/falcon.py | 3 +- vllm/model_executor/models/falcon_h1.py | 8 +- vllm/model_executor/models/gemma.py | 8 +- vllm/model_executor/models/gemma2.py | 5 +- vllm/model_executor/models/gemma3.py | 21 ++-- vllm/model_executor/models/gemma3n.py | 20 ++-- vllm/model_executor/models/glm4.py | 10 +- vllm/model_executor/models/glm4_1v.py | 1 - vllm/model_executor/models/glm4_moe.py | 11 +- vllm/model_executor/models/gpt_j.py | 3 +- vllm/model_executor/models/gpt_neox.py | 3 +- vllm/model_executor/models/gpt_oss.py | 13 ++- vllm/model_executor/models/granite.py | 17 +-- vllm/model_executor/models/granitemoe.py | 13 +-- .../model_executor/models/granitemoehybrid.py | 5 +- 
.../model_executor/models/granitemoeshared.py | 6 +- vllm/model_executor/models/grok1.py | 11 +- vllm/model_executor/models/hunyuan_v1.py | 25 +---- vllm/model_executor/models/internlm2.py | 12 +-- vllm/model_executor/models/internlm2_ve.py | 5 +- vllm/model_executor/models/kimi_linear.py | 5 - vllm/model_executor/models/lfm2.py | 17 +-- vllm/model_executor/models/lfm2_moe.py | 17 +-- vllm/model_executor/models/llama.py | 22 +--- vllm/model_executor/models/llama4.py | 11 +- vllm/model_executor/models/longcat_flash.py | 22 ++-- vllm/model_executor/models/minicpm.py | 12 +-- vllm/model_executor/models/minicpm3.py | 10 +- vllm/model_executor/models/minicpm_eagle.py | 5 +- vllm/model_executor/models/minimax_m2.py | 12 +-- vllm/model_executor/models/minimax_text_01.py | 9 +- vllm/model_executor/models/mixtral.py | 7 +- vllm/model_executor/models/mllama4.py | 8 +- vllm/model_executor/models/molmo.py | 3 +- vllm/model_executor/models/nemotron.py | 17 +-- vllm/model_executor/models/nemotron_nas.py | 19 +--- vllm/model_executor/models/olmo.py | 3 +- vllm/model_executor/models/olmo2.py | 13 +-- vllm/model_executor/models/olmoe.py | 6 +- vllm/model_executor/models/openpangu.py | 26 ++--- vllm/model_executor/models/orion.py | 12 +-- vllm/model_executor/models/ouro.py | 11 +- vllm/model_executor/models/persimmon.py | 3 +- vllm/model_executor/models/phi.py | 6 +- vllm/model_executor/models/phimoe.py | 18 ++-- vllm/model_executor/models/plamo2.py | 7 +- vllm/model_executor/models/qwen.py | 11 +- vllm/model_executor/models/qwen2.py | 16 +-- vllm/model_executor/models/qwen2_5_vl.py | 1 - vllm/model_executor/models/qwen2_moe.py | 12 +-- vllm/model_executor/models/qwen2_vl.py | 1 - vllm/model_executor/models/qwen3.py | 15 +-- vllm/model_executor/models/qwen3_moe.py | 12 +-- vllm/model_executor/models/qwen3_next.py | 3 +- .../models/qwen3_omni_moe_thinker.py | 1 - vllm/model_executor/models/qwen3_vl.py | 1 - vllm/model_executor/models/seed_oss.py | 15 +-- vllm/model_executor/models/solar.py | 18 +--- vllm/model_executor/models/stablelm.py | 2 +- vllm/model_executor/models/starcoder2.py | 3 +- vllm/model_executor/models/step3_text.py | 16 ++- .../models/transformers/utils.py | 10 +- vllm/model_executor/models/zamba2.py | 4 +- vllm/transformers_utils/config.py | 100 +++++++++++++----- vllm/transformers_utils/configs/afmoe.py | 7 +- vllm/transformers_utils/configs/arctic.py | 18 +++- vllm/transformers_utils/configs/flex_olmo.py | 17 +-- .../transformers_utils/configs/kimi_linear.py | 12 ++- vllm/transformers_utils/configs/lfm2_moe.py | 12 ++- .../transformers_utils/configs/midashenglm.py | 2 +- vllm/transformers_utils/configs/mistral.py | 4 +- vllm/transformers_utils/configs/nemotron.py | 60 ++++++----- vllm/transformers_utils/configs/olmo3.py | 12 ++- vllm/transformers_utils/configs/qwen3_next.py | 17 +-- vllm/transformers_utils/configs/step3_vl.py | 12 ++- 104 files changed, 544 insertions(+), 912 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index e62cd60efaec..d4b6f4077ab3 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -872,12 +872,12 @@ steps: optional: true commands: - pip install --upgrade git+https://github.com/huggingface/transformers - - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or 
RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)' + - pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR or KimiVL)' - pytest -v -s tests/models/test_transformers.py # - pytest -v -s tests/models/multimodal/processing/ - - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)' + - pytest -v -s tests/models/multimodal/test_mapping.py - python3 examples/offline_inference/basic/chat.py - # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl + - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl # Whisper needs spawn method to avoid deadlock - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper diff --git a/benchmarks/kernels/benchmark_mrope.py b/benchmarks/kernels/benchmark_mrope.py index cb848d2bf579..83bd91917508 100644 --- a/benchmarks/kernels/benchmark_mrope.py +++ b/benchmarks/kernels/benchmark_mrope.py @@ -6,7 +6,7 @@ # # The CSV file (named with current date/time) contains these columns: # model_name, tp_size, num_tokens, num_heads, num_kv_heads, head_dim, max_position, -# rope_theta, is_neox_style, rope_scaling, dtype, torch_mean, torch_median, torch_p99, +# is_neox_style, rope_parameters, dtype, torch_mean, torch_median, torch_p99, # torch_min, torch_max, triton_mean, triton_median, triton_p99, triton_min, triton_max, # speedup # @@ -86,9 +86,8 @@ def benchmark_mrope( num_heads: int, num_kv_heads: int, max_position: int = 8192, - rope_theta: float = 10000, is_neox_style: bool = True, - rope_scaling: dict[str, Any] = None, + rope_parameters: dict[str, Any] | None = None, dtype: torch.dtype = torch.bfloat16, seed: int = 0, warmup_iter: int = 10, @@ -102,9 +101,8 @@ def benchmark_mrope( head_size=head_dim, rotary_dim=head_dim, max_position=max_position, - base=rope_theta, is_neox_style=is_neox_style, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, dtype=dtype, ).to(device=device) @@ -203,9 +201,8 @@ def benchmark_mrope( num_kv_heads, head_dim, max_position, - rope_theta, is_neox_style, - str(rope_scaling), + str(rope_parameters), str(dtype).split(".")[-1], torch_stats["mean"], torch_stats["median"], @@ -255,9 +252,8 @@ def benchmark_mrope( "num_kv_heads", "head_dim", "max_position", - "rope_theta", "is_neox_style", - "rope_scaling", + "rope_parameters", "dtype", "torch_mean", "torch_median", @@ -303,7 +299,7 @@ def benchmark_mrope( q_size = num_heads * head_dim kv_size = num_kv_heads * head_dim is_neox_style = True - rope_theta = config.rope_theta + rope_parameters = config.rope_parameters max_position = config.max_position_embeddings for num_tokens in num_tokens_list: @@ -315,9 +311,8 @@ def benchmark_mrope( num_heads=num_heads, num_kv_heads=num_kv_heads, max_position=max_position, - rope_theta=rope_theta, is_neox_style=is_neox_style, - rope_scaling=config.rope_scaling, + rope_parameters=rope_parameters, dtype=getattr(torch, args.dtype), seed=args.seed, warmup_iter=args.warmup_iter, diff --git a/examples/offline_inference/context_extension.py b/examples/offline_inference/context_extension.py index df39e4c25d5c..67d33e1881ee 100644 --- a/examples/offline_inference/context_extension.py +++ b/examples/offline_inference/context_extension.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This script demonstrates how to extend the context 
length -of a Qwen model using the YARN method (rope_scaling) +of a Qwen model using the YARN method (rope_parameters) and run a simple chat example. Usage: @@ -19,8 +19,8 @@ def create_llm(): # Use yarn to extend context hf_overrides = { - "rope_theta": rope_theta, - "rope_scaling": { + "rope_parameters": { + "rope_theta": rope_theta, "rope_type": "yarn", "factor": factor, "original_max_position_embeddings": original_max_position_embeddings, diff --git a/tests/compile/test_functionalization.py b/tests/compile/test_functionalization.py index 11ae96e930da..515e0a93ac2a 100644 --- a/tests/compile/test_functionalization.py +++ b/tests/compile/test_functionalization.py @@ -137,7 +137,7 @@ def __init__(self, head_dim=64, rotary_dim=None, max_position=2048, base=10000): self.head_dim, rotary_dim=self.rotary_dim, max_position=max_position, - base=base, + rope_parameters={"rope_type": "default", "rope_theta": base}, ) def forward(self, positions, q, k): @@ -172,7 +172,7 @@ def __init__(self, head_dim=64, num_heads=4, max_position=2048, base=10000): self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=base, + rope_parameters={"rope_type": "default", "rope_theta": base}, ) def forward(self, positions, hidden_states): diff --git a/tests/kernels/core/test_mrope.py b/tests/kernels/core/test_mrope.py index 02b795721f46..43b242ab2d58 100644 --- a/tests/kernels/core/test_mrope.py +++ b/tests/kernels/core/test_mrope.py @@ -5,11 +5,11 @@ import pytest import torch from packaging.version import Version -from transformers import AutoConfig from transformers import __version__ as TRANSFORMERS_VERSION from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.platforms import current_platform +from vllm.transformers_utils.config import get_config device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -98,8 +98,7 @@ def test_mrope( atol = model_info.atol rtol = model_info.rtol - config = AutoConfig.from_pretrained(model_name) - config = config.get_text_config() + config = get_config(model_name, False).get_text_config() # get the model config total_num_kv_heads = config.num_key_value_heads @@ -113,7 +112,6 @@ def test_mrope( ) is_neox_style = True - rope_theta = config.rope_theta max_position = config.max_position_embeddings partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) rotary_dim = int(head_dim * partial_rotary_factor) @@ -122,9 +120,8 @@ def test_mrope( head_size=head_dim, rotary_dim=rotary_dim, max_position=max_position, - base=rope_theta, is_neox_style=is_neox_style, - rope_scaling=config.rope_scaling, + rope_parameters=config.rope_parameters, dtype=dtype, ).to(device=device) @@ -173,8 +170,7 @@ def test_mrope_torch_compile_tracing( atol = model_info.atol rtol = model_info.rtol - config = AutoConfig.from_pretrained(model_name) - config = config.get_text_config() + config = get_config(model_name, False).get_text_config() # get the model config total_num_kv_heads = config.num_key_value_heads @@ -187,7 +183,6 @@ def test_mrope_torch_compile_tracing( else config.hidden_size // total_num_heads ) is_neox_style = True - rope_theta = config.rope_theta max_position = config.max_position_embeddings partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) rotary_dim = int(head_dim * partial_rotary_factor) @@ -196,9 +191,8 @@ def test_mrope_torch_compile_tracing( head_size=head_dim, rotary_dim=rotary_dim, max_position=max_position, - base=rope_theta, is_neox_style=is_neox_style, - rope_scaling=config.rope_scaling, + 
rope_parameters=config.rope_parameters, dtype=dtype, ).to(device=device) diff --git a/tests/kernels/core/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py index c35ee5016ba0..a8ed3825689d 100644 --- a/tests/kernels/core/test_pos_encoding.py +++ b/tests/kernels/core/test_pos_encoding.py @@ -74,7 +74,7 @@ def test_rotary_embedding( device: str, use_key: bool, max_position: int = 8192, - base: float = 10000, + rope_theta: float = 10000, ) -> None: if rotary_dim is None: rotary_dim = head_size @@ -83,7 +83,8 @@ def test_rotary_embedding( torch.set_default_device(device) if rotary_dim is None: rotary_dim = head_size - rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style) + rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} + rope = get_rope(head_size, rotary_dim, max_position, is_neox_style, rope_parameters) rope = rope.to(dtype=dtype, device=torch.get_default_device()) positions = torch.randint(0, max_position, (batch_size, seq_len)) @@ -120,9 +121,9 @@ def test_rotary_embedding( @torch.inference_mode() def test_rope_module_cache(): MAX_POSITIONS = [123, 1234] - BASES = [10000, 1000000] - ROPE_SCALINGS = ( - None, + ROPE_THETAS = [10000, 1000000] + ROPE_PARAMETERS = ( + {"rope_type": "default"}, {"rope_type": "linear", "factor": (1,)}, {"rope_type": "dynamic", "factor": 1}, ) @@ -130,9 +131,9 @@ def test_rope_module_cache(): HEAD_SIZES, ROTARY_DIMS, MAX_POSITIONS, - BASES, + ROPE_THETAS, IS_NEOX_STYLE, - ROPE_SCALINGS, + ROPE_PARAMETERS, DTYPES, ) rope_setting_id_map: dict[str, int] = {} @@ -141,20 +142,20 @@ def test_rope_module_cache(): head_size, rotary_dim, max_position, - base, - is_neox_stype, - rope_scaling, + rope_theta, + is_neox_style, + rope_parameters, dtype, ) = setting if rotary_dim is None: rotary_dim = head_size + rope_parameters["rope_theta"] = rope_theta rope = get_rope( head_size, rotary_dim, max_position, - base, - is_neox_stype, - rope_scaling, + is_neox_style, + rope_parameters, dtype, ) # different settings cannot share the same rope module @@ -168,20 +169,20 @@ def test_rope_module_cache(): head_size, rotary_dim, max_position, - base, - is_neox_stype, - rope_scaling, + rope_theta, + is_neox_style, + rope_parameters, dtype, ) = setting if rotary_dim is None: rotary_dim = head_size + rope_parameters["rope_theta"] = rope_theta rope = get_rope( head_size, rotary_dim, max_position, - base, - is_neox_stype, - rope_scaling, + is_neox_style, + rope_parameters, dtype, ) # check if cache take effect diff --git a/tests/kernels/moe/test_gpt_oss_triton_kernels.py b/tests/kernels/moe/test_gpt_oss_triton_kernels.py index dfd317bcf72f..af33fd4e3fc3 100644 --- a/tests/kernels/moe/test_gpt_oss_triton_kernels.py +++ b/tests/kernels/moe/test_gpt_oss_triton_kernels.py @@ -201,7 +201,7 @@ class ModelConfig: sliding_window: int = 128 initial_context_length: int = 4096 rope_theta: float = 150000.0 - rope_scaling_factor: float = 32.0 + rope_parameters_factor: float = 32.0 rope_ntk_alpha: float = 1.0 rope_ntk_beta: float = 32.0 diff --git a/tests/models/language/pooling/test_nomic_max_model_len.py b/tests/models/language/pooling/test_nomic_max_model_len.py index 88f088c60327..d6216a87a229 100644 --- a/tests/models/language/pooling/test_nomic_max_model_len.py +++ b/tests/models/language/pooling/test_nomic_max_model_len.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project # ruff: noqa: SIM117 +from typing import Any + import pytest from ...utils import EmbedModelInfo @@ -79,8 
+81,8 @@ def test_set_max_model_len_illegal(model_info, vllm_runner): @pytest.mark.parametrize("model_info", MODELS) def test_use_rope_scaling_legal(model_info, vllm_runner): hf_overrides = { - "rope_theta": rope_theta, - "rope_scaling": { + "rope_parameters": { + "rope_theta": rope_theta, "rope_type": "yarn", "factor": factor, "original_max_position_embeddings": original_max_position_embeddings, @@ -96,9 +98,9 @@ def test_use_rope_scaling_legal(model_info, vllm_runner): @pytest.mark.parametrize("model_info", MODELS) def test_use_rope_scaling_illegal(model_info, vllm_runner): - hf_overrides = { - "rope_theta": rope_theta, - "rope_scaling": { + hf_overrides: dict[str, Any] = { + "rope_parameters": { + "rope_theta": rope_theta, "rope_type": "yarn", "factor": factor, "original_max_position_embeddings": original_max_position_embeddings, @@ -115,8 +117,8 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner): pass hf_overrides = { - "rope_theta": rope_theta, - "rope_scaling": { + "rope_parameters": { + "rope_theta": rope_theta, "rope_type": "yarn", "factor": factor, "original_max_position_embeddings": original_max_position_embeddings, diff --git a/tests/test_config.py b/tests/test_config.py index bba2fbec3db2..16f68d18fc68 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -249,45 +249,48 @@ def test_get_bert_tokenization_sentence_transformer_config(): def test_rope_customization(): - TEST_ROPE_SCALING = {"rope_type": "dynamic", "factor": 2.0} - TEST_ROPE_THETA = 16_000_000.0 - LONGCHAT_ROPE_SCALING = {"rope_type": "linear", "factor": 8.0} + TEST_ROPE_PARAMETERS = { + "rope_theta": 16_000_000.0, + "rope_type": "dynamic", + "factor": 2.0, + } + LLAMA_ROPE_PARAMETERS = {"rope_theta": 500000.0, "rope_type": "default"} + LONGCHAT_ROPE_PARAMETERS = {"rope_type": "linear", "factor": 8.0} llama_model_config = ModelConfig("meta-llama/Meta-Llama-3-8B-Instruct") - assert getattr(llama_model_config.hf_config, "rope_scaling", None) is None - assert getattr(llama_model_config.hf_config, "rope_theta", None) == 500_000 + assert ( + getattr(llama_model_config.hf_config, "rope_parameters", None) + == LLAMA_ROPE_PARAMETERS + ) assert llama_model_config.max_model_len == 8192 llama_model_config = ModelConfig( "meta-llama/Meta-Llama-3-8B-Instruct", - hf_overrides={ - "rope_scaling": TEST_ROPE_SCALING, - "rope_theta": TEST_ROPE_THETA, - }, + hf_overrides={"rope_parameters": TEST_ROPE_PARAMETERS}, ) assert ( - getattr(llama_model_config.hf_config, "rope_scaling", None) == TEST_ROPE_SCALING + getattr(llama_model_config.hf_config, "rope_parameters", None) + == TEST_ROPE_PARAMETERS ) - assert getattr(llama_model_config.hf_config, "rope_theta", None) == TEST_ROPE_THETA assert llama_model_config.max_model_len == 16384 longchat_model_config = ModelConfig("lmsys/longchat-13b-16k") - # Check if LONGCHAT_ROPE_SCALING entries are in longchat_model_config + # Check if LONGCHAT_ROPE_PARAMETERS entries are in longchat_model_config assert all( - longchat_model_config.hf_config.rope_scaling.get(key) == value - for key, value in LONGCHAT_ROPE_SCALING.items() + longchat_model_config.hf_config.rope_parameters.get(key) == value + for key, value in LONGCHAT_ROPE_PARAMETERS.items() ) assert longchat_model_config.max_model_len == 16384 longchat_model_config = ModelConfig( "lmsys/longchat-13b-16k", hf_overrides={ - "rope_scaling": TEST_ROPE_SCALING, + "rope_parameters": TEST_ROPE_PARAMETERS, }, ) assert ( - getattr(longchat_model_config.hf_config, "rope_scaling", None) - == TEST_ROPE_SCALING + 
getattr(longchat_model_config.hf_config, "rope_parameters", None) + == TEST_ROPE_PARAMETERS ) assert longchat_model_config.max_model_len == 4096 diff --git a/vllm/config/model.py b/vllm/config/model.py index b563a40eb8fc..d1e56a72a318 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -11,6 +11,7 @@ from pydantic import ConfigDict, SkipValidation, field_validator, model_validator from pydantic.dataclasses import dataclass from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE +from transformers.configuration_utils import ALLOWED_LAYER_TYPES import vllm.envs as envs from vllm.config.multimodal import MMCacheType, MMEncoderTPMode, MultiModalConfig @@ -2100,31 +2101,32 @@ def _get_and_verify_max_len( ) derived_max_model_len = default_max_len - rope_scaling = getattr(hf_config, "rope_scaling", None) + # In Transformers v5 rope_parameters could be TypedDict or dict[str, TypedDict]. + # To simplify the verification, we convert it to dict[str, TypedDict]. + rope_parameters = getattr(hf_config, "rope_parameters", None) + if rope_parameters and not set(rope_parameters.keys()).issubset( + ALLOWED_LAYER_TYPES + ): + rope_parameters = {"": rope_parameters} + # NOTE(woosuk): Gemma3's max_model_len (128K) is already scaled by RoPE # scaling, so we skip applying the scaling factor again. - if rope_scaling is not None and "gemma3" not in hf_config.model_type: - # No need to consider "type" key because of patch_rope_scaling when - # loading HF config - rope_type = rope_scaling["rope_type"] - - if rope_type not in ("su", "longrope", "llama3"): - if disable_sliding_window: - # TODO(robertgshaw): Find a model that supports rope_scaling - # with sliding window to see if this case should be allowed. - raise NotImplementedError( - "Disabling sliding window is not supported for models " - "with rope_scaling. Please raise an issue so we can " - "investigate." - ) - - # NOTE: rope_type == "default" does not define factor - # https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/modeling_rope_utils.py - scaling_factor = rope_scaling.get("factor", 1.0) - - if rope_type == "yarn": - derived_max_model_len = rope_scaling["original_max_position_embeddings"] - derived_max_model_len *= scaling_factor + if rope_parameters is not None and "gemma3" not in hf_config.model_type: + scaling_factor = 1.0 + for rp in rope_parameters.values(): + # No need to consider "type" key because of patch_rope_parameters when + # loading HF config + rope_type = rp["rope_type"] + + if rope_type not in ("su", "longrope", "llama3"): + # NOTE: rope_type == "default" does not define factor https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/modeling_rope_utils.py + # NOTE: This assumes all layer types have the same scaling factor. 
+ scaling_factor = rp.get("factor", scaling_factor) + + if rope_type == "yarn": + derived_max_model_len = rp["original_max_position_embeddings"] + # Do this outside loop since all layer types should have the same scaling + derived_max_model_len *= scaling_factor if encoder_config and "max_seq_length" in encoder_config: derived_max_model_len = encoder_config["max_seq_length"] @@ -2134,7 +2136,9 @@ def _get_and_verify_max_len( if max_model_len is None: # For LongRoPE, default to original_max_position_embeddings to avoid # performance degradation for shorter sequences - if rope_scaling is not None and rope_scaling["rope_type"] == "longrope": + if rope_parameters is not None and any( + rp["rope_type"] == "longrope" for rp in rope_parameters.values() + ): max_model_len = int( getattr( hf_config, "original_max_position_embeddings", derived_max_model_len @@ -2151,16 +2155,7 @@ def _get_and_verify_max_len( # that will be bigger than derived_max_model_len. We compare user input # with model_max_length and allow this override when it's smaller. model_max_length = getattr(hf_config, "model_max_length", None) - if model_max_length is not None and max_model_len <= model_max_length: - if disable_sliding_window: - # TODO(robertgshaw): Find a model that has model_max_length - # with sliding window to see if this case should be allowed. - raise NotImplementedError( - "Disabling sliding window is not supported for models " - "model_max_length in the config. Please raise an issue " - "so we can investigate." - ) - else: + if model_max_length is None or max_model_len > model_max_length: msg = ( f"User-specified max_model_len ({max_model_len}) is greater " f"than the derived max_model_len ({max_len_key}=" diff --git a/vllm/model_executor/layers/rotary_embedding/__init__.py b/vllm/model_executor/layers/rotary_embedding/__init__.py index 56c165f9c041..ae8a7d93b50e 100644 --- a/vllm/model_executor/layers/rotary_embedding/__init__.py +++ b/vllm/model_executor/layers/rotary_embedding/__init__.py @@ -26,23 +26,23 @@ def get_rope( head_size: int, rotary_dim: int, max_position: int, - base: float, is_neox_style: bool = True, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, dtype: torch.dtype | None = None, partial_rotary_factor: float = 1.0, dual_chunk_attention_config: dict[str, Any] | None = None, ) -> RotaryEmbedding: if dtype is None: dtype = torch.get_default_dtype() - if rope_scaling is not None: + if rope_parameters is not None: # Transforms every value that is a list into a tuple for caching calls - rope_scaling_tuple = { - k: tuple(v) if isinstance(v, list) else v for k, v in rope_scaling.items() + rope_parameters_tuple = { + k: tuple(v) if isinstance(v, list) else v + for k, v in rope_parameters.items() } - rope_scaling_args = tuple(rope_scaling_tuple.items()) + rope_parameters_args = tuple(rope_parameters_tuple.items()) else: - rope_scaling_args = None + rope_parameters_args = None if dual_chunk_attention_config is not None: dual_chunk_attention_tuple = { @@ -60,15 +60,15 @@ def get_rope( head_size, rotary_dim, max_position, - base, is_neox_style, - rope_scaling_args, + rope_parameters_args, dual_chunk_attention_args, dtype, ) if key in _ROPE_DICT: return _ROPE_DICT[key] + base = rope_parameters["rope_theta"] if rope_parameters else 10000 if dual_chunk_attention_config is not None: extra_kwargs = { k: v @@ -84,18 +84,18 @@ def get_rope( dtype, **extra_kwargs, ) - elif not rope_scaling: + elif not rope_parameters: rotary_emb = RotaryEmbedding( head_size, 
rotary_dim, max_position, base, is_neox_style, dtype ) else: - scaling_type = rope_scaling["rope_type"] + scaling_type = rope_parameters["rope_type"] if scaling_type == "llama3": - scaling_factor = rope_scaling["factor"] - low_freq_factor = rope_scaling["low_freq_factor"] - high_freq_factor = rope_scaling["high_freq_factor"] - original_max_position = rope_scaling["original_max_position_embeddings"] + scaling_factor = rope_parameters["factor"] + low_freq_factor = rope_parameters["low_freq_factor"] + high_freq_factor = rope_parameters["high_freq_factor"] + original_max_position = rope_parameters["original_max_position_embeddings"] rotary_emb = Llama3RotaryEmbedding( head_size, rotary_dim, @@ -113,7 +113,7 @@ def get_rope( head_size, rotary_dim, max_position, base, is_neox_style, dtype ) elif scaling_type == "default": - if "mrope_section" in rope_scaling: + if "mrope_section" in rope_parameters: rotary_emb = MRotaryEmbedding( head_size, rotary_dim, @@ -121,8 +121,8 @@ def get_rope( base, is_neox_style, dtype, - mrope_section=rope_scaling["mrope_section"], - mrope_interleaved=rope_scaling.get("mrope_interleaved", False), + mrope_section=rope_parameters["mrope_section"], + mrope_interleaved=rope_parameters.get("mrope_interleaved", False), ) else: rotary_emb = RotaryEmbedding( @@ -134,7 +134,7 @@ def get_rope( dtype, ) elif scaling_type == "linear": - scaling_factor = rope_scaling["factor"] + scaling_factor = rope_parameters["factor"] rotary_emb = LinearScalingRotaryEmbedding( head_size, rotary_dim, @@ -145,8 +145,8 @@ def get_rope( dtype, ) elif scaling_type == "ntk": - scaling_factor = rope_scaling["factor"] - mixed_b = rope_scaling.get("mixed_b", None) + scaling_factor = rope_parameters["factor"] + mixed_b = rope_parameters.get("mixed_b") rotary_emb = NTKScalingRotaryEmbedding( head_size, rotary_dim, @@ -158,8 +158,8 @@ def get_rope( mixed_b, ) elif scaling_type == "dynamic": - if "alpha" in rope_scaling: - scaling_alpha = rope_scaling["alpha"] + if "alpha" in rope_parameters: + scaling_alpha = rope_parameters["alpha"] rotary_emb = DynamicNTKAlphaRotaryEmbedding( head_size, rotary_dim, @@ -169,8 +169,8 @@ def get_rope( scaling_alpha, dtype, ) - elif "factor" in rope_scaling: - scaling_factor = rope_scaling["factor"] + elif "factor" in rope_parameters: + scaling_factor = rope_parameters["factor"] rotary_emb = DynamicNTKScalingRotaryEmbedding( head_size, rotary_dim, @@ -185,11 +185,11 @@ def get_rope( "Dynamic rope scaling must contain either 'alpha' or 'factor' field" ) elif scaling_type == "yarn": - scaling_factor = rope_scaling["factor"] - original_max_position = rope_scaling["original_max_position_embeddings"] + scaling_factor = rope_parameters["factor"] + original_max_position = rope_parameters["original_max_position_embeddings"] extra_kwargs = { k: v - for k, v in rope_scaling.items() + for k, v in rope_parameters.items() if k in ( "extrapolation_factor", @@ -199,7 +199,7 @@ def get_rope( "apply_yarn_scaling", ) } - if "mrope_section" in rope_scaling: + if "mrope_section" in rope_parameters: extra_kwargs.pop("apply_yarn_scaling", None) rotary_emb = MRotaryEmbedding( head_size, @@ -208,8 +208,8 @@ def get_rope( base, is_neox_style, dtype, - mrope_section=rope_scaling["mrope_section"], - mrope_interleaved=rope_scaling.get("mrope_interleaved", False), + mrope_section=rope_parameters["mrope_section"], + mrope_interleaved=rope_parameters.get("mrope_interleaved", False), scaling_factor=scaling_factor, **extra_kwargs, ) @@ -225,12 +225,12 @@ def get_rope( **extra_kwargs, ) elif scaling_type == 
"deepseek_yarn": - scaling_factor = rope_scaling["factor"] - original_max_position = rope_scaling["original_max_position_embeddings"] + scaling_factor = rope_parameters["factor"] + original_max_position = rope_parameters["original_max_position_embeddings"] # assert max_position == original_max_position * scaling_factor extra_kwargs = { k: v - for k, v in rope_scaling.items() + for k, v in rope_parameters.items() if k in ( "extrapolation_factor", @@ -252,12 +252,12 @@ def get_rope( **extra_kwargs, ) elif scaling_type == "longrope": - short_factor = rope_scaling["short_factor"] - long_factor = rope_scaling["long_factor"] - original_max_position = rope_scaling["original_max_position_embeddings"] + short_factor = rope_parameters["short_factor"] + long_factor = rope_parameters["long_factor"] + original_max_position = rope_parameters["original_max_position_embeddings"] extra_kwargs = { k: v - for k, v in rope_scaling.items() + for k, v in rope_parameters.items() if k in ("short_mscale", "long_mscale") } rotary_emb = Phi3LongRoPEScaledRotaryEmbedding( diff --git a/vllm/model_executor/models/afmoe.py b/vllm/model_executor/models/afmoe.py index 6f654f47495f..4eb5665a71fc 100644 --- a/vllm/model_executor/models/afmoe.py +++ b/vllm/model_executor/models/afmoe.py @@ -5,7 +5,6 @@ import typing from collections.abc import Callable, Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -171,8 +170,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 131072, head_dim: int | None = None, rms_norm_eps: float = 1e-05, @@ -202,7 +199,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings # Check if this is a local attention layer @@ -246,8 +242,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config["rope_parameters"], is_neox_style=True, ) else: @@ -303,14 +298,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 131072) # DecoderLayers are created with `make_layers` which passes the prefix @@ -323,8 +310,6 @@ def __init__( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, head_dim=config.head_dim, rms_norm_eps=config.rms_norm_eps, diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py index 0a8f21abb0a3..b75e91319bba 100644 --- a/vllm/model_executor/models/apertus.py +++ b/vllm/model_executor/models/apertus.py @@ -27,7 +27,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -118,8 +117,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 
10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -155,7 +152,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -176,9 +172,7 @@ def __init__( prefix=f"{prefix}.o_proj", ) - self._init_rotary_emb( - config, rope_scaling=rope_scaling, quant_config=quant_config - ) + self._init_rotary_emb(config, quant_config=quant_config) sliding_window = None if layer_types := getattr(config, "layer_types", None): @@ -224,7 +218,6 @@ def forward( def _init_rotary_emb( self, config: ApertusConfig, - rope_scaling: dict[str, Any] | None, quant_config: QuantizationConfig | None, ) -> None: is_neox_style = True @@ -236,8 +229,7 @@ def _init_rotary_emb( self.head_dim, rotary_dim=int(self.partial_rotary_factor * self.head_dim), max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, partial_rotary_factor=self.partial_rotary_factor, ) @@ -253,14 +245,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -288,8 +272,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/arcee.py b/vllm/model_executor/models/arcee.py index 20c3ff075450..b3887b16f4d7 100644 --- a/vllm/model_executor/models/arcee.py +++ b/vllm/model_executor/models/arcee.py @@ -103,15 +103,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - # Rotary embedding parameters (reuse LLaMA defaults) - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Determine if attention bias is needed (some variants use bias terms) attention_bias = getattr(config, "attention_bias", False) or getattr( @@ -133,8 +124,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index b5cc07a56535..b75a254761d4 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -292,7 +292,6 @@ def __init__( self.kv_size = 
self.num_kv_heads * self.head_dim self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta self.scaling = self.head_dim**-0.5 self.qkv_proj = QKVParallelLinear( @@ -317,7 +316,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=int(self.rope_theta), + rope_parameters=config.rope_parameters, is_neox_style=True, ) diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 8991ef4c606b..edf47270e527 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -136,7 +136,7 @@ def __init__( hidden_size: int, num_heads: int, position_embedding: str, - rope_theta: float = 10000, + rope_parameters: dict, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -150,7 +150,6 @@ def __init__( self.num_heads = self.total_num_heads // tensor_model_parallel_world_size self.head_dim = hidden_size // self.total_num_heads self.position_embedding = position_embedding - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings # pylint: disable=invalid-name @@ -192,7 +191,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, + rope_parameters=rope_parameters, ) self.scaling = self.head_dim**-0.5 self.attn = Attention( @@ -229,13 +228,12 @@ def __init__( ): super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.self_attn = BaiChuanAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, position_embedding=position_embedding, - rope_theta=rope_theta, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py index 024425bb2440..cc10e936a2d3 100644 --- a/vllm/model_executor/models/bailing_moe.py +++ b/vllm/model_executor/models/bailing_moe.py @@ -135,9 +135,8 @@ def __init__( self.head_dim, rotary_dim=self.rotary_dim, max_position=config.max_position_embeddings, - base=config.rope_theta, + rope_parameters=config.rope_parameters, is_neox_style=True, - rope_scaling=config.rope_scaling, partial_rotary_factor=self.partial_rotary_factor, ) diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py index c6cc83487fec..4422bb5da98f 100644 --- a/vllm/model_executor/models/bamba.py +++ b/vllm/model_executor/models/bamba.py @@ -156,8 +156,6 @@ def __init__( prefix: str = "", ) -> None: super().__init__() - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.hidden_size = config.hidden_size tp_size = get_tensor_model_parallel_world_size() @@ -178,7 +176,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings if hasattr(config, "partial_rotary_factor"): @@ -192,8 +189,7 @@ def __init__( head_size=self.head_dim, rotary_dim=rotary_dim, max_position=max_position_embeddings, - rope_scaling=rope_scaling, - 
base=rope_theta, + rope_parameters=config.rope_parameters, is_neox_style=True, dtype=torch.get_default_dtype(), # see impl of get_rope ) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 3c87bbfefab3..b5a6d00dc309 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -265,8 +265,7 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any], max_position_embeddings: int = 4096, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -293,7 +292,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -318,8 +316,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( @@ -369,14 +366,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 4096) self.self_attn = ChameleonAttention( @@ -385,8 +374,7 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=False, @@ -439,14 +427,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 4096) self.self_attn = ChameleonAttention( @@ -455,8 +435,7 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=False, diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 5d6f5e9125a2..dbfcd62d0bca 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -99,6 +99,7 @@ def __init__( # https://huggingface.co/zai-org/chatglm3-6b-32k/blob/e210410255278dd9d74463cf396ba559c0ef801c/modeling_chatglm.py#L141 rope_ratio = getattr(config, "rope_ratio", 1.0) max_positions = getattr(config, "seq_length", 8192) + rope_parameters = {"rope_type": "default", "rope_theta": 10000 * rope_ratio} # NOTE: zai-org/cogagent-9b-20241220 uses original_rope=False, # which is equivalent to is_neox_style=True is_neox_style = not config.original_rope @@ -106,7 +107,7 @@ def __init__( 
self.head_dim, rotary_dim=self.head_dim // 2, max_position=max_positions, - base=10000 * rope_ratio, + rope_parameters=rope_parameters, is_neox_style=is_neox_style, ) self.attn = Attention( diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 77bb17851981..5ed920927c77 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -156,8 +156,6 @@ def __init__( self.max_position_embeddings = getattr( config, "model_max_length", None ) or getattr(config, "max_position_embeddings", 8192) - self.rope_theta = config.rope_theta - self.rope_scaling = getattr(config, "rope_scaling", None) self.use_qk_norm = getattr(config, "use_qk_norm", False) self.qkv_proj = QKVParallelLinear( self.hidden_size, @@ -179,8 +177,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_scaling=self.rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=False, ) diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 66b246878b0a..3cf4bf991e66 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -8,6 +8,7 @@ from vllm.logger import init_logger from vllm.model_executor.models import ModelRegistry from vllm.platforms import current_platform +from vllm.transformers_utils.config import set_default_rope_theta from vllm.utils.math_utils import cdiv, round_up from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec @@ -46,8 +47,7 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: "head_size": head_dim, "rotary_dim": getattr(config, "rotary_emb_dim", head_dim), "max_position": config.max_position_embeddings, - "base": config.rope_theta, - "rope_scaling": getattr(config, "rope_scaling", None), + "rope_parameters": config.rope_parameters, } @@ -78,12 +78,13 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: if not model_config.enforce_eager: max_position = round_up(max_position, 8) + set_default_rope_theta(config, default_theta=config.rotary_emb_base) + config.rotary_kwargs = { "head_size": head_dim, "rotary_dim": getattr(config, "rotary_emb_dim", head_dim), "max_position": max_position, - "base": getattr(config, "rope_theta", config.rotary_emb_base), - "rope_scaling": getattr(config, "rope_scaling", None), + "rope_parameters": config.rope_parameters, } @@ -117,18 +118,20 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: head_dim = config.hidden_size // config.num_attention_heads rotary_emb_dim = int(head_dim * config.rotary_emb_fraction) max_trained_positions = getattr(config, "max_trained_positions", 2048) + + set_default_rope_theta(config, default_theta=config.rotary_emb_base) + config.rotary_kwargs = { "head_size": head_dim, "rotary_dim": rotary_emb_dim, "max_position": max_trained_positions, - "base": getattr(config, "rope_theta", config.rotary_emb_base), - "rope_scaling": getattr(config, "rope_scaling", None), + "rope_parameters": config.rope_parameters, } # we ignore config.rotary_scaling_factor so that for datasets shorter # than max_trained_positions 2048, the results are consistent # with SentenceTransformer. - # The context extension uses vllm style rope_theta and rope_scaling. + # The context extension uses vllm style rope_theta and rope_parameters. 
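Note: across this series the separate `base=` and `rope_scaling=` arguments to `get_rope` are folded into a single `rope_parameters` dict, as the rotary_kwargs change above shows. A minimal sketch of the assumed dict shape and the new-style call, with hypothetical head size and max position (it assumes a vLLM tree with this series applied):

    from vllm.model_executor.layers.rotary_embedding import get_rope

    # Assumed shape of config.rope_parameters after this change: the base
    # frequency and any scaling fields live in one dict keyed by "rope_type".
    rope_parameters = {
        "rope_type": "default",   # e.g. "yarn" or "deepseek_yarn" when scaled
        "rope_theta": 10000.0,    # replaces the former `base=` argument
    }
    rotary_emb = get_rope(
        128,                      # head_size (hypothetical)
        rotary_dim=128,
        max_position=8192,
        rope_parameters=rope_parameters,
        is_neox_style=True,
    )

Helpers such as `set_default_rope_theta(config, default_theta=...)`, imported from `vllm.transformers_utils.config`, appear to backfill `rope_theta` into `config.rope_parameters` for configs that only carry the legacy field; the authoritative behavior lives in that module.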
# See #17785 #18755 if ( not vllm_config.model_config.hf_overrides @@ -172,7 +175,7 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: if hasattr(hf_text_config, "max_model_len"): delattr(hf_text_config, "max_model_len") hf_text_config.max_position_embeddings = max_trained_positions - hf_text_config.rope_scaling = config.rotary_kwargs["rope_scaling"] + hf_text_config.rope_parameters = config.rotary_kwargs["rope_parameters"] # The priority of sentence_bert_config.json is higher # than max_position_embeddings @@ -246,8 +249,7 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: "head_size": head_dim, "rotary_dim": getattr(config, "rotary_emb_dim", head_dim), "max_position": config.max_position_embeddings, - "base": config.rope_theta, - "rope_scaling": getattr(config, "rope_scaling", None), + "rope_parameters": config.rope_parameters, } diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index 528ef4f76742..2c729019081a 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -197,7 +197,10 @@ def __init__( self.head_dim = self.d_model // self.total_num_heads self.total_num_kv_heads = config.attn_config.kv_n_heads self.clip_qkv = config.attn_config.clip_qkv - self.rope_theta = config.attn_config.rope_theta + rope_parameters = { + "rope_type": "default", + "rope_theta": int(config.attn_config.rope_theta), + } self.max_position = config.max_seq_len # pylint: disable=invalid-name @@ -221,7 +224,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position, - base=int(self.rope_theta), + rope_parameters=rope_parameters, is_neox_style=True, ) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index e8ee9951d611..6675b2133f38 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -27,7 +27,6 @@ import typing from collections.abc import Callable, Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -111,8 +110,6 @@ def __init__( config: DeepseekV2Config | DeepseekV3Config, hidden_size: int, num_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -139,7 +136,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -162,8 +158,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_heads, @@ -409,8 +404,6 @@ def __init__( v_head_dim: int, q_lora_rank: int, kv_lora_rank: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -430,7 +423,6 @@ def __init__( assert num_heads % tp_size == 0 self.num_local_heads = num_heads // tp_size self.scaling = self.qk_head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings assert topk_indices_buffer is None, ( "topk_indices_buffer is not \ @@ -485,21 +477,20 @@ def 
__init__( quant_config=quant_config, prefix=f"{prefix}.o_proj", ) - if rope_scaling: - rope_scaling["rope_type"] = "deepseek_yarn" + if config.rope_parameters["rope_type"] != "default": + config.rope_parameters["rope_type"] = "deepseek_yarn" self.rotary_emb = get_rope( qk_rope_head_dim, rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=False, ) - if rope_scaling: - mscale_all_dim = rope_scaling.get("mscale_all_dim", False) - scaling_factor = rope_scaling["factor"] + if config.rope_parameters["rope_type"] != "default": + mscale_all_dim = config.rope_parameters.get("mscale_all_dim", False) + scaling_factor = config.rope_parameters["factor"] mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) self.scaling = self.scaling * mscale * mscale @@ -903,8 +894,6 @@ def __init__( v_head_dim: int, q_lora_rank: int | None, kv_lora_rank: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -927,7 +916,6 @@ def __init__( self.num_local_heads = num_heads // tp_size self.scaling = self.qk_head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings if self.q_lora_rank is not None: @@ -981,19 +969,18 @@ def __init__( prefix=f"{prefix}.o_proj", ) - if rope_scaling: - rope_scaling["rope_type"] = "deepseek_yarn" + if config.rope_parameters["rope_type"] != "default": + config.rope_parameters["rope_type"] = "deepseek_yarn" self.rotary_emb = get_rope( qk_rope_head_dim, rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=False, ) - if rope_scaling: - mscale_all_dim = rope_scaling.get("mscale_all_dim", False) - scaling_factor = rope_scaling["factor"] + if config.rope_parameters["rope_type"] != "default": + mscale_all_dim = config.rope_parameters.get("mscale_all_dim", False) + scaling_factor = config.rope_parameters["factor"] mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) self.scaling = self.scaling * mscale * mscale @@ -1073,8 +1060,6 @@ def __init__( parallel_config = vllm_config.parallel_config self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) moe_layer_freq = getattr(config, "moe_layer_freq", 1) # DecoderLayers are created with `make_layers` which passes the prefix @@ -1107,8 +1092,6 @@ def __init__( v_head_dim=v_head_dim, q_lora_rank=config.q_lora_rank if hasattr(config, "q_lora_rank") else None, kv_lora_rank=kv_lora_rank, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py index d24da0c42a25..e65c275106a4 100644 --- a/vllm/model_executor/models/dots1.py +++ b/vllm/model_executor/models/dots1.py @@ -27,7 +27,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -202,8 +201,6 @@ def __init__( num_heads: int, num_kv_heads: int, config: Dots1Config, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, 
max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -229,7 +226,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings attention_bias = config.attention_bias @@ -255,8 +251,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_heads, @@ -296,8 +291,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) layer_idx = int(prefix.split(sep=".")[-1]) self.layer_idx = layer_idx @@ -307,8 +300,6 @@ def __init__( num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, config=config, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py index f2999968669f..a7df3509e3ec 100644 --- a/vllm/model_executor/models/ernie45_moe.py +++ b/vllm/model_executor/models/ernie45_moe.py @@ -62,6 +62,7 @@ maybe_remap_kv_scale_name, ) from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import set_default_rope_theta from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP from .utils import ( @@ -232,9 +233,8 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, + rope_parameters: dict[str, Any], head_dim: int | None = None, - rope_theta: float = 500000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 131072, rms_norm_eps: float = 1e-05, qkv_bias: bool = False, @@ -266,7 +266,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -291,9 +290,8 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, + rope_parameters=rope_parameters, is_neox_style=False, - rope_scaling=rope_scaling, ) self.attn = Attention( self.num_heads, @@ -333,16 +331,14 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 500000) - rope_scaling = getattr(config, "rope_scaling", None) + set_default_rope_theta(config, default_theta=500000) max_position_embeddings = getattr(config, "max_position_embeddings", 131072) self.self_attn = Ernie4_5_MoeAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, head_dim=getattr(config, "head_dim", None), - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, rms_norm_eps=config.rms_norm_eps, qkv_bias=getattr(config, "use_bias", False), diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py index e8ef86f9b7f0..50e033d77606 100644 --- 
a/vllm/model_executor/models/ernie45_vl_moe.py +++ b/vllm/model_executor/models/ernie45_vl_moe.py @@ -58,6 +58,7 @@ maybe_remap_kv_scale_name, ) from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import set_default_rope_theta from .ernie45_moe import Ernie4_5_MoeMLP from .interfaces import SupportsPP @@ -91,9 +92,8 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, + rope_parameters: dict[str, Any], head_dim: int | None = None, - rope_theta: float = 500000, - rope_scaling: dict[str, Any] | None = None, freq_allocation: int = 20, max_position_embeddings: int = 131072, rms_norm_eps: float = 1e-05, @@ -126,7 +126,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -155,7 +154,7 @@ def __init__( head_size=self.head_dim, rotary_dim=self.head_dim, max_position_embeddings=max_position_embeddings, - base=rope_theta, + base=rope_parameters["rope_theta"], is_neox_style=False, dtype=torch.get_default_dtype(), mrope_section=[h_rope, w_rope, t_rope], @@ -413,8 +412,7 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 500000) - rope_scaling = getattr(config, "rope_scaling", None) + set_default_rope_theta(config, default_theta=500000) freq_allocation = getattr(config, "freq_allocation", 20) max_position_embeddings = getattr(config, "max_position_embeddings", 131072) @@ -423,8 +421,7 @@ def __init__( num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, head_dim=getattr(config, "head_dim", None), - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, freq_allocation=freq_allocation, max_position_embeddings=max_position_embeddings, rms_norm_eps=config.rms_norm_eps, diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 6c56bfc433c7..d13275488fe9 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -27,7 +27,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -113,8 +112,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -144,7 +141,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -173,8 +169,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, ) self.attn = Attention( @@ -207,8 +202,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -221,8 +214,6 @@ def __init__( hidden_size=hidden_size, num_heads=num_heads, num_kv_heads=num_kv_heads, - rope_theta=rope_theta, 
- rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=bias, @@ -251,14 +242,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -272,8 +255,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py index b89e168ada20..70f3cce2b7c5 100644 --- a/vllm/model_executor/models/exaone4.py +++ b/vllm/model_executor/models/exaone4.py @@ -23,7 +23,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -52,6 +51,7 @@ maybe_remap_kv_scale_name, ) from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import set_default_rope_theta from .interfaces import SupportsLoRA, SupportsPP from .utils import ( @@ -110,8 +110,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 1000000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -141,7 +139,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -176,12 +173,12 @@ def __init__( # apply rotary embeddings to every layer in full attention models self.apply_rope_all_layers = "sliding_attention" not in config.layer_types + set_default_rope_theta(config, default_theta=1000000) self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, ) self.attn = Attention( @@ -227,14 +224,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 1000000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -249,8 +238,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index 
85acdff3d96b..dc2d51f340c8 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -164,13 +164,12 @@ def __init__( ) if self.use_rotary: - rope_theta = getattr(config, "rope_theta", 10000) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_heads, diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py index b985847af5da..9433f0d1b4a4 100644 --- a/vllm/model_executor/models/falcon_h1.py +++ b/vllm/model_executor/models/falcon_h1.py @@ -35,6 +35,7 @@ ) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import set_default_rope_theta from .interfaces import ( HasInnerState, @@ -214,8 +215,7 @@ def __init__( prefix: str = "", ) -> None: super().__init__() - rope_theta = getattr(config, "rope_theta", 1e11) - rope_scaling = getattr(config, "rope_scaling", None) + set_default_rope_theta(config, default_theta=1e11) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.hidden_size = config.hidden_size tp_size = get_tensor_model_parallel_world_size() @@ -240,7 +240,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings if hasattr(config, "partial_rotary_factor"): @@ -254,8 +253,7 @@ def __init__( head_size=self.head_dim, rotary_dim=rotary_dim, max_position=max_position_embeddings, - rope_scaling=rope_scaling, - base=rope_theta, + rope_parameters=config.rope_parameters, is_neox_style=True, dtype=None, # see impl of get_rope ) diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 7aaae7c503b5..00c7f59a0809 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -20,6 +20,7 @@ from collections.abc import Iterable from functools import cache from itertools import islice +from typing import Any import torch from torch import nn @@ -127,8 +128,8 @@ def __init__( num_heads: int, num_kv_heads: int, head_dim: int, + rope_parameters: dict[str, Any], max_position_embeddings: int = 8192, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -153,7 +154,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.qkv_proj = QKVParallelLinear( hidden_size, @@ -176,7 +176,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=self.rope_theta, + rope_parameters=rope_parameters, is_neox_style=True, ) self.attn = Attention( @@ -218,7 +218,7 @@ def __init__( num_kv_heads=config.num_key_value_heads, head_dim=config.head_dim, max_position_embeddings=config.max_position_embeddings, - rope_theta=config.rope_theta, + rope_parameters=config.rope_parameters, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index 4d5d6cbb37c6..9b6cfe693230 100644 --- 
a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -107,7 +107,6 @@ def __init__( num_kv_heads: int, head_dim: int, max_position_embeddings: int, - rope_theta: float, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, attn_logits_soft_cap: float | None = None, @@ -134,7 +133,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = config.query_pre_attn_scalar**-0.5 - self.rope_theta = rope_theta self.qkv_proj = QKVParallelLinear( hidden_size, @@ -156,7 +154,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=self.rope_theta, + rope_parameters=config.rope_parameters, is_neox_style=True, ) @@ -206,7 +204,6 @@ def __init__( num_kv_heads=config.num_key_value_heads, head_dim=config.head_dim, max_position_embeddings=config.max_position_embeddings, - rope_theta=config.rope_theta, cache_config=cache_config, quant_config=quant_config, attn_logits_soft_cap=config.attn_logit_softcapping, diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index 357e61a4e78b..565719ae7fae 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -155,25 +155,28 @@ def __init__( self.k_norm = GemmaRMSNorm(self.head_dim, eps=config.rms_norm_eps) layer_idx = extract_layer_index(prefix) - self.is_sliding = config.layer_types[layer_idx] == "sliding_attention" + layer_type = config.layer_types[layer_idx] + self.is_sliding = layer_type == "sliding_attention" sliding_window = config.sliding_window if self.is_sliding else None # Initialize the rotary embedding. - if self.is_sliding: - # Local attention. Override the values in config.json. - self.rope_theta = config.rope_local_base_freq - self.rope_scaling = {"rope_type": "default"} + if layer_type in config.rope_parameters: + # Transformers v5 rope config. + rope_parameters = config.rope_parameters[layer_type] else: + # Transformers v4 rope config. # Global attention. Use the values in config.json. - self.rope_theta = config.rope_theta - self.rope_scaling = config.rope_scaling + rope_parameters = config.rope_parameters.copy() + # Local attention. Override the values in config.json. + if self.is_sliding: + rope_parameters["rope_theta"] = config.rope_local_base_freq + self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=self.rope_theta, + rope_parameters=rope_parameters, is_neox_style=True, - rope_scaling=self.rope_scaling, ) if getattr(config, "is_causal", True): diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py index 64443190f53e..8f1447ba34a8 100644 --- a/vllm/model_executor/models/gemma3n.py +++ b/vllm/model_executor/models/gemma3n.py @@ -332,18 +332,21 @@ def __init__( ) layer_idx = extract_layer_index(prefix) - is_sliding = config.layer_types[layer_idx] == "sliding_attention" + layer_type = config.layer_types[layer_idx] + is_sliding = layer_type == "sliding_attention" self.sliding_window = config.sliding_window if is_sliding else None # Initialize the rotary embedding. - if is_sliding: - # Local attention. Override the values in config.json. - rope_theta = config.rope_local_base_freq - rope_scaling = {"rope_type": "default"} + if layer_type in config.rope_parameters: + # Transformers v5 rope config. + rope_parameters = config.rope_parameters[layer_type] else: + # Transformers v4 rope config. 
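Note: the Gemma3/Gemma3n hunks distinguish a nested, per-layer-type rope config (assumed Transformers v5 layout) from the flat v4 layout, where sliding-window layers then override `rope_theta` with `rope_local_base_freq`. A standalone sketch of the two assumed shapes and the selection logic, with hypothetical values:

    # Hypothetical v5-style config: one entry per layer type.
    rope_parameters_v5 = {
        "full_attention": {"rope_type": "default", "rope_theta": 1_000_000.0},
        "sliding_attention": {"rope_type": "default", "rope_theta": 10_000.0},
    }
    # Hypothetical v4-style config: a single flat dict.
    rope_parameters_v4 = {"rope_type": "default", "rope_theta": 1_000_000.0}

    def select_rope_parameters(rope_parameters: dict, layer_type: str,
                               rope_local_base_freq: float) -> dict:
        # Sketch of the per-layer selection used in the hunks above.
        if layer_type in rope_parameters:
            return rope_parameters[layer_type]
        params = rope_parameters.copy()
        if layer_type == "sliding_attention":
            params["rope_theta"] = rope_local_base_freq
        return params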
# Global attention. Use the values in config.json. - rope_theta = config.rope_theta - rope_scaling = config.rope_scaling + rope_parameters = config.rope_parameters.copy() + # Local attention. Override the values in config.json. + if is_sliding: + rope_parameters["rope_theta"] = config.rope_local_base_freq first_kv_shared_layer_idx = ( config.num_hidden_layers - config.num_kv_shared_layers @@ -383,9 +386,8 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, + rope_parameters=rope_parameters, is_neox_style=True, - rope_scaling=rope_scaling, ) self.attn = Attention( diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py index faa0674a2e43..f8ef3b0385fb 100644 --- a/vllm/model_executor/models/glm4.py +++ b/vllm/model_executor/models/glm4.py @@ -57,10 +57,8 @@ def __init__( max_position: int = 4096 * 32, head_dim: int | None = None, qkv_bias: bool = False, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, - rope_scaling: tuple | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, ) -> None: @@ -86,7 +84,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.qkv_proj = QKVParallelLinear( hidden_size, self.head_dim, @@ -107,8 +104,7 @@ def __init__( self.head_dim, rotary_dim=self.rotary_dim, max_position=max_position, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, partial_rotary_factor=partial_rotary_factor, is_neox_style=False, ) @@ -150,8 +146,6 @@ def __init__( quant_config = vllm_config.quant_config self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 1000000) - rope_scaling = getattr(config, "rope_scaling", None) self.self_attn = Glm4Attention( config=config, @@ -159,12 +153,10 @@ def __init__( num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, qkv_bias=getattr(config, "attention_bias", False), head_dim=getattr(config, "head_dim", None), cache_config=cache_config, quant_config=quant_config, - rope_scaling=rope_scaling, prefix=f"{prefix}.self_attn", attn_type=AttentionType.DECODER, ) diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 7a4fee76ae6b..6581bbda6d60 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -703,7 +703,6 @@ def __init__( head_size=head_dim, rotary_dim=head_dim // 2, max_position=8192, - base=10000.0, is_neox_style=True, ) self.blocks = nn.ModuleList( diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 1422dbe9b3cd..5aa51af54a00 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -26,7 +26,6 @@ import typing from collections.abc import Callable, Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -233,8 +232,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 131072, head_dim: int | None = None, rms_norm_eps: float = 1e-05, @@ -264,7 +261,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * 
self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.use_qk_norm = use_qk_norm @@ -291,8 +287,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, partial_rotary_factor=partial_rotary_factor, ) self.attn = Attention( @@ -341,8 +336,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 131072) # DecoderLayers are created with `make_layers` which passes the prefix # with the layer's index. @@ -354,8 +347,6 @@ def __init__( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, head_dim=config.head_dim, rms_norm_eps=config.rms_norm_eps, diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index e416ecde0c1e..e94de8952fa6 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -95,13 +95,12 @@ def __init__( scaling = self.head_size**-0.5 assert getattr(config, "rotary", True) assert config.rotary_dim % 2 == 0 - rope_theta = getattr(config, "rope_theta", 10000) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.rotary_emb = get_rope( self.head_size, rotary_dim=config.rotary_dim, max_position=max_position_embeddings, - base=rope_theta, + rope_parameters=config.rope_parameters, is_neox_style=False, ) self.attn = Attention( diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index af0c9209231c..815c2fba4d9f 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -92,13 +92,12 @@ def __init__( scaling = self.head_size**-0.5 rotary_dim = int(self.head_size * config.rotary_pct) assert rotary_dim % 2 == 0 - rope_theta = getattr(config, "rope_theta", 10000) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.rotary_emb = get_rope( self.head_size, rotary_dim=rotary_dim, max_position=max_position_embeddings, - base=rope_theta, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_heads, diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 7df3b087ccb8..f310f71af92d 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -67,16 +67,16 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=config.max_position_embeddings, - base=config.rope_theta, dtype=torch.float32, - rope_scaling={ + rope_parameters={ + "rope_theta": config.rope_parameters["rope_theta"], "rope_type": "yarn", - "factor": config.rope_scaling["factor"], - "original_max_position_embeddings": config.rope_scaling[ + "factor": config.rope_parameters["factor"], + "original_max_position_embeddings": config.rope_parameters[ "original_max_position_embeddings" ], - "beta_fast": config.rope_scaling["beta_fast"], - "beta_slow": config.rope_scaling["beta_slow"], + "beta_fast": config.rope_parameters["beta_fast"], + "beta_slow": config.rope_parameters["beta_slow"], }, is_neox_style=True, ) @@ -90,7 +90,6 @@ def __init__( self.q_size = 
self.num_attention_heads * self.head_dim // tp_size self.kv_size = self.num_key_value_heads * self.head_dim // tp_size self.scaling = self.head_dim**-0.5 - self.rope_theta = config.rope_theta self.qkv_proj = QKVParallelLinear( hidden_size=self.hidden_size, diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index c44b4021471e..1dc205b47753 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -26,7 +26,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -112,8 +111,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -143,7 +140,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = config.attention_multiplier - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -167,8 +163,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_heads, @@ -204,14 +199,6 @@ def __init__( super().__init__() self.hidden_size = config.hidden_size self.residual_multiplier = config.residual_multiplier - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -225,8 +212,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index 5c6759ded066..8f4139d63c3f 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -141,8 +141,7 @@ def __init__( num_heads: int, num_kv_heads: int, max_position: int = 4096 * 32, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, attention_multiplier: float | None = None, @@ -172,7 +171,6 @@ def __init__( if attention_multiplier is not None else self.head_dim**-1 ) - self.rope_theta = rope_theta self.qkv_proj = QKVParallelLinear( hidden_size, @@ -194,9 +192,8 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=int(self.rope_theta), + rope_parameters=rope_parameters, is_neox_style=True, - rope_scaling=rope_scaling, ) self.attn = Attention( self.num_heads, @@ -235,16 +232,12 @@ def __init__( parallel_config = vllm_config.parallel_config self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = 
getattr(config, "rope_scaling", None) self.self_attn = GraniteMoeAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py index a340112ec62a..9d5eeef198a6 100644 --- a/vllm/model_executor/models/granitemoehybrid.py +++ b/vllm/model_executor/models/granitemoehybrid.py @@ -273,10 +273,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=config.max_position_embeddings, - base=int(config.rope_theta), - rope_scaling=config.rope_scaling - if hasattr(config, "rope_scaling") and config.rope_scaling is not None - else None, + rope_parameters=config.rope_parameters, is_neox_style=True, ) else: diff --git a/vllm/model_executor/models/granitemoeshared.py b/vllm/model_executor/models/granitemoeshared.py index 926c539af33b..fd346db7e35a 100644 --- a/vllm/model_executor/models/granitemoeshared.py +++ b/vllm/model_executor/models/granitemoeshared.py @@ -84,16 +84,12 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) self.self_attn = GraniteMoeAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py index 9dc231863f74..4bf23cd6fd19 100644 --- a/vllm/model_executor/models/grok1.py +++ b/vllm/model_executor/models/grok1.py @@ -25,6 +25,7 @@ from collections.abc import Iterable from itertools import islice +from typing import Any import torch import torch.nn.functional as F @@ -134,7 +135,7 @@ def __init__( num_heads: int, num_kv_heads: int, max_position: int = 4096 * 32, - rope_theta: float = 10000, + rope_parameters: dict[str, Any] | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -161,7 +162,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.qkv_proj = QKVParallelLinear( hidden_size, @@ -183,7 +183,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=int(self.rope_theta), + rope_parameters=rope_parameters, is_neox_style=True, ) @@ -234,15 +234,12 @@ def __init__( if not self.use_fp8 and hasattr(quant_config, "is_fp8"): self.use_fp8 = quant_config.is_fp8 - # Requires transformers > 4.32.0 - # Default rope_theta value if not in config - rope_theta = 10000 self.attn = Grok1Attention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, + rope_parameters=config.rope_parameters, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", diff --git 
a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py index 1eadcbe67ade..9fa5e2bd33f2 100644 --- a/vllm/model_executor/models/hunyuan_v1.py +++ b/vllm/model_executor/models/hunyuan_v1.py @@ -27,7 +27,6 @@ import typing from collections.abc import Callable, Iterable from itertools import islice -from typing import Any import regex as re import torch @@ -142,8 +141,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -177,7 +174,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.use_qk_norm = getattr(config, "use_qk_norm", False) self.layer_id = layer_id @@ -204,8 +200,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.attn = Attention( @@ -254,8 +249,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -289,7 +282,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.use_qk_norm = getattr(config, "use_qk_norm", False) self.layer_id = layer_id @@ -314,8 +306,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.attn = Attention( @@ -494,14 +485,6 @@ def __init__( if isinstance(config.intermediate_size, int) else config.intermediate_size[layer_id] ) - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) attention_bias = getattr(config, "attention_bias", False) or getattr( config, "bias", False @@ -520,8 +503,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, @@ -537,8 +518,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 60fbeb842dd4..dc8f821bd134 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -91,8 +91,7 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - 
rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -120,7 +119,6 @@ def __init__( self.kv_size = self.num_kv_heads * self.head_dim self.key_value_groups = int(self.num_heads / self.num_kv_heads) self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.wqkv = QKVParallelLinear( @@ -144,8 +142,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( self.num_heads, @@ -204,15 +201,12 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.attention = InternLM2Attention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py index 6dc081e34157..a57db82242af 100644 --- a/vllm/model_executor/models/internlm2_ve.py +++ b/vllm/model_executor/models/internlm2_ve.py @@ -30,15 +30,12 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.attention = InternLM2Attention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/kimi_linear.py b/vllm/model_executor/models/kimi_linear.py index f3675075a48f..4562b2202c5e 100644 --- a/vllm/model_executor/models/kimi_linear.py +++ b/vllm/model_executor/models/kimi_linear.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable -from typing import Any import torch from torch import nn @@ -190,9 +189,7 @@ def __init__( v_head_dim: int, q_lora_rank: int | None, kv_lora_rank: int, - rope_theta: float = 10000, use_nope: bool = False, - rope_scaling: dict[str, Any] | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -210,11 +207,9 @@ def __init__( tp_size = get_tensor_model_parallel_world_size() self.num_local_heads = num_heads // tp_size self.scaling = self.qk_head_dim**-0.5 - self.rope_theta = rope_theta self.use_nope = use_nope assert self.use_nope is True assert self.q_lora_rank is None - assert rope_scaling is None assert num_heads % tp_size == 0 self.kv_a_proj_with_mqa = ReplicatedLinear( self.hidden_size, diff --git a/vllm/model_executor/models/lfm2.py b/vllm/model_executor/models/lfm2.py index aeb25602f11a..74bdde27ece5 100644 --- a/vllm/model_executor/models/lfm2.py +++ 
b/vllm/model_executor/models/lfm2.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from itertools import islice -from typing import Any import torch import torch.nn as nn @@ -96,8 +95,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -126,7 +123,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -149,8 +145,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.attn = Attention( @@ -199,14 +194,6 @@ def __init__( self.config = config self.layer_idx = layer_idx - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.self_attn = Lfm2Attention( @@ -215,8 +202,6 @@ def __init__( hidden_size=config.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/lfm2_moe.py b/vllm/model_executor/models/lfm2_moe.py index 6b7b5564ee98..c088a0821152 100644 --- a/vllm/model_executor/models/lfm2_moe.py +++ b/vllm/model_executor/models/lfm2_moe.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from itertools import islice -from typing import Any import torch import torch.nn as nn @@ -189,8 +188,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -219,7 +216,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -242,8 +238,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.attn = Attention( @@ -293,14 +288,6 @@ def __init__( self.config = config self.layer_idx = layer_idx - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = 
getattr(config, "max_position_embeddings", 8192) self.self_attn = Lfm2MoeAttention( @@ -309,8 +296,6 @@ def __init__( hidden_size=config.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 0a3f37c30ab5..d5b49d2fb4c2 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -26,7 +26,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -120,8 +119,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -157,7 +154,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings llama_4_scaling_config = getattr(config, "llama_4_scaling", None) @@ -186,9 +182,7 @@ def __init__( prefix=f"{prefix}.o_proj", ) - self._init_rotary_emb( - config, rope_scaling=rope_scaling, quant_config=quant_config - ) + self._init_rotary_emb(config, quant_config=quant_config) sliding_window = None if layer_types := getattr(config, "layer_types", None): @@ -258,7 +252,6 @@ def forward( def _init_rotary_emb( self, config: LlamaConfig, - rope_scaling: dict[str, Any] | None, quant_config: QuantizationConfig | None, ) -> None: is_neox_style = True @@ -270,8 +263,7 @@ def _init_rotary_emb( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, partial_rotary_factor=self.partial_rotary_factor, ) @@ -291,14 +283,6 @@ def __init__( quant_config = self.get_quant_config(vllm_config) self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -326,8 +310,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index a7e0732ec71e..4c6d1d424475 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -19,7 +19,6 @@ """Inference-only LLaMA model compatible with HuggingFace weights.""" from collections.abc import Iterable -from typing import Any import torch from torch import nn @@ -171,8 +170,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, 
max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -208,7 +205,6 @@ def __init__( self.floor_scale = getattr(config, "floor_scale", 8192.0) self.attn_scale = getattr(config, "attn_scale", 0.1) - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.n_rep = self.num_heads // self.num_kv_heads self.qk_norm = ( @@ -248,8 +244,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=int(rope_theta), - rope_scaling=rope_scaling if rope_scaling != "default" else None, + rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, ) if not self.nope @@ -331,8 +326,6 @@ def __init__( self.layer_idx = extract_layer_index(prefix) self.global_layer = config.no_rope_layers[self.layer_idx] == 0 self.hidden_size = config.hidden_size - rope_theta = config.rope_theta - rope_scaling = config.rope_scaling max_position_embeddings = config.max_position_embeddings self.self_attn = Llama4Attention( @@ -340,8 +333,6 @@ def __init__( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=False, diff --git a/vllm/model_executor/models/longcat_flash.py b/vllm/model_executor/models/longcat_flash.py index 5de10e708683..fafe97cd2be7 100644 --- a/vllm/model_executor/models/longcat_flash.py +++ b/vllm/model_executor/models/longcat_flash.py @@ -108,8 +108,7 @@ def __init__( eos_token_id=100001, pretraining_tp=1, tie_word_embeddings=False, - rope_theta=1000000.0, - rope_scaling=None, + rope_parameters=None, attention_bias=False, attention_dropout=0.0, mla_scale_q_lora=False, @@ -162,8 +161,13 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.pretraining_tp = pretraining_tp self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"} + rope_theta = kwargs.pop("rope_theta", 1000000.0) + if "rope_theta" not in rope_parameters: + rope_parameters["rope_theta"] = rope_theta + self.rope_parameters = rope_parameters self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.mla_scale_q_lora = mla_scale_q_lora @@ -336,15 +340,7 @@ def __init__( super().__init__() self.layer_idx = int(prefix.split(sep=".")[-1]) self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) # Dual attention structure self.self_attn = nn.ModuleList( @@ -361,8 +357,6 @@ def __init__( config.q_lora_rank if hasattr(config, "q_lora_rank") else None ), kv_lora_rank=config.kv_lora_rank, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=None diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 914b097fe199..04923833065f 100644 --- a/vllm/model_executor/models/minicpm.py +++ 
b/vllm/model_executor/models/minicpm.py @@ -230,8 +230,7 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -257,7 +256,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -281,8 +279,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( @@ -324,8 +321,6 @@ def __init__( self.cache_config = cache_config self.quant_config = quant_config self.hidden_size = config.hidden_size - self.rope_theta = getattr(config, "rope_theta", 10000) - self.rope_scaling = getattr(config, "rope_scaling", None) self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.prefix = prefix self._init_attn_block() @@ -339,8 +334,7 @@ def _init_attn_block(self): hidden_size=self.hidden_size, num_heads=self.config.num_attention_heads, num_kv_heads=self.config.num_key_value_heads, - rope_theta=self.rope_theta, - rope_scaling=self.rope_scaling, + rope_parameters=self.config.rope_parameters, max_position_embeddings=self.max_position_embeddings, cache_config=self.cache_config, quant_config=self.quant_config, diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py index d3b6966ee3a7..2d775219fc97 100644 --- a/vllm/model_executor/models/minicpm3.py +++ b/vllm/model_executor/models/minicpm3.py @@ -25,8 +25,6 @@ # limitations under the License. 
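The MiniCPM hunks above show the model-side pattern this patch applies throughout: attention modules stop threading rope_theta/rope_scaling through their constructors and instead forward the consolidated config.rope_parameters dict to get_rope. A minimal sketch of the new call shape (the build_rotary_emb helper is illustrative, not part of the patch):

from vllm.model_executor.layers.rotary_embedding import get_rope

def build_rotary_emb(config, head_dim: int, max_position_embeddings: int):
    # config.rope_parameters is expected to carry at least
    # {"rope_type": ..., "rope_theta": ...} plus any scaling fields.
    return get_rope(
        head_dim,
        rotary_dim=head_dim,
        max_position=max_position_embeddings,
        rope_parameters=config.rope_parameters,
    )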
"""Inference-only MiniCPM3 model compatible with HuggingFace weights.""" -from typing import Any - import torch from torch import nn from transformers import PretrainedConfig @@ -62,8 +60,6 @@ def __init__( v_head_dim: int, q_lora_rank: int, kv_lora_rank: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -84,7 +80,6 @@ def __init__( self.num_local_heads = num_heads // tp_size self.scaling = self.qk_head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.q_a_proj = ReplicatedLinear( @@ -127,8 +122,7 @@ def __init__( self.qk_rope_head_dim, rotary_dim=self.qk_rope_head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_local_heads, @@ -204,8 +198,6 @@ def _init_attn_block(self): v_head_dim=self.config.v_head_dim, q_lora_rank=self.config.q_lora_rank, kv_lora_rank=self.config.kv_lora_rank, - rope_theta=self.rope_theta, - rope_scaling=self.rope_scaling, max_position_embeddings=self.max_position_embeddings, cache_config=self.cache_config, quant_config=self.quant_config, diff --git a/vllm/model_executor/models/minicpm_eagle.py b/vllm/model_executor/models/minicpm_eagle.py index d0cdb70aa857..e6bccfcac4f1 100644 --- a/vllm/model_executor/models/minicpm_eagle.py +++ b/vllm/model_executor/models/minicpm_eagle.py @@ -69,8 +69,6 @@ def __init__( self.cache_config = cache_config self.quant_config = quant_config self.hidden_size = config.hidden_size - self.rope_theta = getattr(config, "rope_theta", 10000) - self.rope_scaling = getattr(config, "rope_scaling", None) self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.prefix = prefix self._init_attn_block() @@ -84,8 +82,7 @@ def _init_attn_block(self): hidden_size=self.hidden_size, num_heads=self.config.num_attention_heads, num_kv_heads=self.config.num_key_value_heads, - rope_theta=self.rope_theta, - rope_scaling=self.rope_scaling, + rope_parameters=self.config.rope_parameters, max_position_embeddings=self.max_position_embeddings, cache_config=self.cache_config, quant_config=self.quant_config, diff --git a/vllm/model_executor/models/minimax_m2.py b/vllm/model_executor/models/minimax_m2.py index 49d2f2d26196..4955c68c0cda 100644 --- a/vllm/model_executor/models/minimax_m2.py +++ b/vllm/model_executor/models/minimax_m2.py @@ -149,8 +149,7 @@ def __init__( num_heads: int, num_kv_heads: int, rotary_dim: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, attn_window_size: int | None = None, max_position_embeddings: int = 8192, head_dim: int | None = None, @@ -180,7 +179,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -205,8 +203,7 @@ def __init__( self.head_dim, rotary_dim=rotary_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( self.num_heads, @@ -252,8 +249,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = 
getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) if hasattr(config, "max_model_len") and isinstance(config.max_model_len, int): max_position_embeddings = max( @@ -269,8 +264,7 @@ def __init__( num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, rotary_dim=config.rotary_dim, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, rms_norm_eps=config.rms_norm_eps, qkv_bias=getattr(config, "attention_bias", False), diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index bf1ecc822756..50f7396e2de6 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ b/vllm/model_executor/models/minimax_text_01.py @@ -188,7 +188,7 @@ def __init__( num_kv_heads: int, rotary_dim: int, max_position: int = 4096 * 32, - rope_theta: float = 10000, + rope_parameters: dict | None = None, sliding_window: int | None = None, quant_config: QuantizationConfig | None = None, layer_idx: int = None, @@ -214,7 +214,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.sliding_window = sliding_window self.prefix = prefix @@ -247,7 +246,7 @@ def __init__( head_size=self.head_dim, rotary_dim=rotary_dim, max_position=max_position, - base=int(rope_theta), + rope_parameters=rope_parameters, is_neox_style=True, dtype=torch.float32, ) @@ -287,8 +286,6 @@ def __init__( self.hidden_size = config.hidden_size self.expert_num = expert_num - rope_theta = getattr(config, "rope_theta", 10000) - head_dim = getattr(config, "head_dim", None) if head_dim is None: head_dim = config.hidden_size // config.num_attention_heads @@ -328,7 +325,7 @@ def __init__( else head_dim, num_kv_heads=config.num_key_value_heads, max_position=max_position_embeddings, - rope_theta=rope_theta, + rope_parameters=config.rope_parameters, sliding_window=config.sliding_window, quant_config=quant_config, layer_idx=self._ilayer, diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index d7a1cb82fb4f..54ab8dd493e7 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -161,7 +161,6 @@ def __init__( num_heads: int, num_kv_heads: int, max_position: int = 4096 * 32, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -189,7 +188,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.qkv_proj = QKVParallelLinear( hidden_size, @@ -211,7 +209,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=int(self.rope_theta), + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.attn = Attention( @@ -248,15 +246,12 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 10000) self.self_attn = MixtralAttention( config=config, hidden_size=self.hidden_size, num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, cache_config=cache_config, quant_config=quant_config, 
prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index e25a104d822a..286859d188d3 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -292,13 +292,17 @@ def __init__( prefix=f"{prefix}.o_proj", ) + rope_parameters = { + "rope_type": "mllama4", + "rope_theta": config.rope_parameters["rope_theta"], + } + self.rotary_emb = get_rope( head_size=self.head_dim, rotary_dim=config.hidden_size // config.num_attention_heads // 2, # number of image patches max_position=(config.image_size // config.patch_size) ** 2, - base=config.rope_theta, - rope_scaling={"rope_type": "mllama4"}, + rope_parameters=rope_parameters, is_neox_style=False, dtype=torch.complex64, # important ) diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index ab83a271e30a..dc06938d5d6e 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -410,7 +410,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta # Attention input projection. Projects x -> (q, k, v) self.qkv_proj = QKVParallelLinear( @@ -437,7 +436,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, + rope_parameters=config.rope_parameters, ) self.scaling = self.head_dim**-0.5 self.attn = Attention( diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index 92dcf5ea5700..c3337bd1ea69 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -26,7 +26,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -150,8 +149,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -181,7 +178,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.partial_rotary_factor = config.partial_rotary_factor self.max_position_embeddings = max_position_embeddings @@ -206,8 +202,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, partial_rotary_factor=self.partial_rotary_factor, ) self.attn = Attention( @@ -243,14 +238,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -264,8 +251,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, 
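Where a model needs a RoPE variant that is not stored verbatim in the checkpoint config, the dict is assembled by hand, as in the mllama4 vision hunk above: only rope_theta is reused from config.rope_parameters, while rope_type is fixed by the implementation. A small sketch of that construction (the vision_rope_parameters helper name is illustrative):

def vision_rope_parameters(config) -> dict:
    # The base period comes from the checkpoint config; the rope_type is
    # chosen by the model code rather than read from the config.
    return {
        "rope_type": "mllama4",
        "rope_theta": config.rope_parameters["rope_theta"],
    }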
max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py index b839206a3094..2eebe38051cb 100644 --- a/vllm/model_executor/models/nemotron_nas.py +++ b/vllm/model_executor/models/nemotron_nas.py @@ -26,7 +26,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -82,8 +81,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -97,8 +94,6 @@ def __init__( hidden_size, num_heads, num_kv_heads, - rope_theta, - rope_scaling, max_position_embeddings, quant_config, bias, @@ -111,7 +106,6 @@ def __init__( def _init_rotary_emb( self, config, - rope_scaling: dict[str, Any] | None, quant_config: QuantizationConfig | None, ) -> None: # Enables YARN for Mistral and LLaMA4 derivatives. @@ -126,8 +120,7 @@ def _init_rotary_emb( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, partial_rotary_factor=self.partial_rotary_factor, ) @@ -148,14 +141,6 @@ def __init__( self._is_no_op_ffn = block_config.ffn.no_op self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -176,8 +161,6 @@ def __init__( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=num_kv_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 487e3f671a45..bd8a8e317544 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -87,7 +87,6 @@ def __init__( self.num_heads = self.total_num_heads // tensor_model_parallel_world_size self.head_dim = self.hidden_size // self.total_num_heads self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta self.clip_qkv = config.clip_qkv # Attention input projection. 
Projects x -> (q, k, v) @@ -105,7 +104,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, + rope_parameters=config.rope_parameters, ) self.scaling = self.head_dim**-0.5 self.attn = Attention( diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index 045582c889ee..f0f6b2f6b3e6 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -99,7 +99,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.max_position_embeddings = self.config.max_position_embeddings - self.rope_theta = self.config.rope_theta # Attention input projection. Projects x -> (q, k, v) self.qkv_proj = QKVParallelLinear( @@ -139,15 +138,17 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): prefix=f"{prefix}.attn", ) - # Rotary embeddings. Rope scaling is only applied on full attention - # layers. - self.rope_scaling = self.config.rope_scaling if sliding_window is None else None + # Rotary embeddings. Rope scaling is only applied on full attention layers. + if sliding_window is None: + rope_parameters = self.config.rope_parameters + else: + rope_theta = self.config.rope_parameters["rope_theta"] + rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, # type: ignore - rope_scaling=self.rope_scaling, + rope_parameters=rope_parameters, ) # Attention output projection. diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index 499eb05de76e..c39e338d72e2 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -123,8 +123,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: quant_config = vllm_config.quant_config self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 4096) num_heads = config.num_attention_heads @@ -148,7 +146,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -176,8 +173,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.attn = Attention( diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py index d13a745beffe..f814cdfec5a2 100644 --- a/vllm/model_executor/models/openpangu.py +++ b/vllm/model_executor/models/openpangu.py @@ -77,6 +77,7 @@ sequence_parallel_chunk, ) from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import set_default_rope_theta def check_ffn_act_fn(act_fn: str): @@ -259,7 +260,6 @@ def __init__( v_head_dim: int, q_lora_rank: int | None, kv_lora_rank: int, - rope_theta: float = 10000, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = 
None, quant_config: QuantizationConfig | None = None, @@ -274,8 +274,6 @@ def __init__( self.v_head_dim = v_head_dim self.q_lora_rank = q_lora_rank self.kv_lora_rank = kv_lora_rank - self.rope_theta = rope_theta - self.tp_size = get_tensor_model_parallel_world_size() if num_heads % self.tp_size != 0: raise ValueError( @@ -339,7 +337,9 @@ def __init__( ) # TODO: remove hard coding - rope_scaling = { + set_default_rope_theta(config, default_theta=10000) + rope_parameters = { + "rope_theta": config.rope_parameters["rope_theta"], "beta_fast": 32, "beta_slow": 1, "factor": 1, @@ -353,8 +353,7 @@ def __init__( qk_rope_head_dim, rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, is_neox_style=False, ) @@ -407,8 +406,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -454,7 +451,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -475,9 +471,7 @@ def __init__( prefix=f"{prefix}.o_proj", ) - self._init_rotary_emb( - config, rope_scaling=rope_scaling, quant_config=quant_config - ) + self._init_rotary_emb(config, quant_config=quant_config) if hasattr(config, "interleaved_sliding_window"): interleaved_sliding_window = config.interleaved_sliding_window @@ -521,7 +515,6 @@ def forward( def _init_rotary_emb( self, config: PretrainedConfig, - rope_scaling: dict[str, Any] | None, quant_config: QuantizationConfig | None, ) -> None: is_neox_style = True @@ -533,8 +526,7 @@ def _init_rotary_emb( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, ) @@ -555,7 +547,6 @@ def __init__( parallel_config = vllm_config.parallel_config self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) layer_idx = int(prefix.split(sep=".")[-1]) @@ -579,7 +570,6 @@ def __init__( config.q_lora_rank if hasattr(config, "q_lora_rank") else None ), kv_lora_rank=config.kv_lora_rank, - rope_theta=rope_theta, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, @@ -607,8 +597,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=getattr(config, "rope_scaling", None), max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 859cd2cecf89..b30be93ca726 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -88,8 +88,7 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -115,7 +114,6 @@ def __init__( 
self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -139,8 +137,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( self.num_heads, @@ -175,15 +172,12 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.self_attn = OrionAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/ouro.py b/vllm/model_executor/models/ouro.py index 9db6c317c26a..63d2fff6ec8b 100644 --- a/vllm/model_executor/models/ouro.py +++ b/vllm/model_executor/models/ouro.py @@ -112,10 +112,8 @@ def __init__( num_heads: int, num_kv_heads: int, max_position: int = 4096 * 32, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, - rope_scaling: tuple | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, dual_chunk_attention_config: dict[str, Any] | None = None, @@ -140,7 +138,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.dual_chunk_attention_config = dual_chunk_attention_config # Get total_ut_steps from config, default to 4 if not specified @@ -170,8 +167,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, ) self.attn = nn.ModuleList() @@ -226,9 +222,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 1000000) - rope_scaling = getattr(config, "rope_scaling", None) dual_chunk_attention_config = getattr( config, "dual_chunk_attention_config", None ) @@ -244,10 +237,8 @@ def __init__( num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, cache_config=cache_config, quant_config=quant_config, - rope_scaling=rope_scaling, prefix=f"{prefix}.self_attn", attn_type=attn_type, dual_chunk_attention_config=dual_chunk_attention_config, diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index 3bf6a1d9763d..98963d52e484 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -106,7 +106,6 @@ def __init__( self.num_heads = self.total_num_heads // tensor_parallel_world_size self.head_dim = self.hidden_size // self.total_num_heads self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta self.partial_rotary_factor = config.partial_rotary_factor 
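The openpangu hunks earlier in this section rely on set_default_rope_theta, a helper added later in this patch (vllm/transformers_utils/config.py), to back-fill a base period for configs that never defined one before model-specific scaling keys are merged in. A sketch of the expected call order (prepare_rope_parameters is an illustrative name; the numeric factors are the hard-coded values visible in the openpangu hunk):

from vllm.transformers_utils.config import set_default_rope_theta

def prepare_rope_parameters(config) -> dict:
    # Ensure config.rope_parameters exists and carries a rope_theta.
    set_default_rope_theta(config, default_theta=10000)
    rope_parameters = {"rope_theta": config.rope_parameters["rope_theta"]}
    # Layer the model-specific YaRN-style keys on top of the defaulted value.
    rope_parameters.update({"beta_fast": 32, "beta_slow": 1, "factor": 1})
    return rope_parameters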
self.is_causal = True @@ -138,7 +137,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, + rope_parameters=config.rope_parameters, partial_rotary_factor=self.partial_rotary_factor, ) self.scaling = self.head_dim**-0.5 diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index 8fee53c23fb4..da476f621627 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -115,16 +115,12 @@ def __init__( ) assert rotary_dim % 2 == 0 - # pylint: disable=C0301 - # Refer to: - # https://huggingface.co/microsoft/phi-1_5/blob/d212a789620c380ff32ca1d1ee9943a777360987/modeling_phi.py#L518 - rope_theta = getattr(config, "rope_theta", 10000.0) max_position_embeddings = getattr(config, "max_position_embeddings", 2048) self.rotary_emb = get_rope( self.head_size, rotary_dim=rotary_dim, max_position=max_position_embeddings, - base=rope_theta, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_heads, diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 92fd858b608b..8ffac95d9396 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -86,7 +86,7 @@ def __init__( bos_token_id=1, eos_token_id=2, tie_word_embeddings=False, - rope_theta=1e6, + rope_parameters=None, sliding_window=None, attention_dropout=0.0, num_experts_per_tok=2, @@ -119,7 +119,9 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta + if rope_parameters is None: + rope_theta = kwargs.pop("rope_theta", 1e6) + rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} self.attention_dropout = attention_dropout self.num_experts_per_tok = num_experts_per_tok @@ -302,12 +304,11 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, + rope_parameters: dict, head_dim: int | None = None, max_position: int = 4096 * 32, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, - rope_scaling: dict | None = None, prefix: str = "", ) -> None: super().__init__() @@ -332,8 +333,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.qkv_proj = QKVParallelLinear( hidden_size, @@ -355,9 +354,8 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=int(self.rope_theta), + rope_parameters=rope_parameters, is_neox_style=True, - rope_scaling=self.rope_scaling, ) self.attn = Attention( self.num_heads, @@ -393,7 +391,6 @@ def __init__( super().__init__() self.hidden_size = config.hidden_size # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 10000) self.self_attn = PhiMoEAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, @@ -402,10 +399,9 @@ def __init__( head_dim=getattr( config, "head_dim", self.hidden_size // config.num_attention_heads ), - rope_theta=rope_theta, cache_config=cache_config, quant_config=quant_config, - rope_scaling=config.rope_scaling, + rope_parameters=config.rope_parameters, prefix=f"{prefix}.self_attn", ) self.block_sparse_moe = PhiMoE( diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index 52c9755e0e0e..22f9c87fc905 100644 --- 
a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -567,10 +567,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> No prefix=f"{prefix}.o_proj", ) - self.rope_theta = config.rope_theta if hasattr(config, "rope_theta") else 10000 - self.rope_scaling = ( - config.rope_scaling if hasattr(config, "rope_scaling") else None - ) max_position = config.max_position_embeddings if hasattr(vllm_config.model_config, "max_model_len") and isinstance( vllm_config.model_config.max_model_len, int @@ -581,8 +577,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> No self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=self.rope_theta, - rope_scaling=self.rope_scaling, + rope_parameters=config.rope_parameters, ) self.q_norm = RMSNorm(config.hidden_size_per_head, eps=config.rms_norm_eps) self.q_norm.weight = torch.nn.Parameter( diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 50a125c3f597..c973e7917098 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -83,8 +83,7 @@ def __init__( hidden_size: int, num_heads: int, max_position_embeddings: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -117,8 +116,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( self.num_heads, @@ -153,14 +151,11 @@ def __init__( super().__init__() self.ln_1 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) self.attn = QWenAttention( config.hidden_size, config.num_attention_heads, config.max_position_embeddings, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 1bbb969ce5aa..32b6d6dd07b8 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -57,7 +57,7 @@ maybe_remap_kv_scale_name, ) from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.config import is_interleaved +from vllm.transformers_utils.config import is_interleaved, set_default_rope_theta from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP from .utils import ( @@ -114,11 +114,10 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, + rope_parameters: dict[str, Any], max_position: int = 4096 * 32, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, - rope_scaling: tuple | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, dual_chunk_attention_config: dict[str, Any] | None = None, @@ -143,7 +142,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.dual_chunk_attention_config = dual_chunk_attention_config self.qkv_proj = QKVParallelLinear( @@ -167,8 +165,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, 
max_position=max_position, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, ) attn_cls = ( @@ -216,9 +213,7 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 1000000) - rope_scaling = getattr(config, "rope_scaling", None) + set_default_rope_theta(config, default_theta=1000000) dual_chunk_attention_config = getattr( config, "dual_chunk_attention_config", None ) @@ -237,10 +232,9 @@ def __init__( num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, cache_config=cache_config, quant_config=quant_config, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, prefix=f"{prefix}.self_attn", attn_type=attn_type, dual_chunk_attention_config=dual_chunk_attention_config, diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 5b5d50ec8935..8e3c0e84dfe5 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -641,7 +641,6 @@ def __init__( head_size=head_dim, rotary_dim=head_dim // 2, max_position=8192, - base=10000.0, is_neox_style=True, ) diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 2ff0d19df238..6b97d0b2ca2e 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -194,8 +194,7 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -222,7 +221,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.dual_chunk_attention_config = dual_chunk_attention_config @@ -248,8 +246,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, ) self.attn = Attention( @@ -291,8 +288,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) dual_chunk_attention_config = getattr( config, "dual_chunk_attention_config", None ) @@ -301,8 +296,7 @@ def __init__( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index cda8eaf5377f..d25ff2785bfe 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -643,7 +643,6 @@ def __init__( head_size=head_dim, rotary_dim=head_dim // 2, max_position=8192, - base=10000.0, is_neox_style=True, ) diff --git 
a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index 8d7f22a33fe6..93a629d81e8f 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -42,6 +42,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import set_default_rope_theta from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP from .qwen2 import Qwen2MLP as Qwen3MLP @@ -57,14 +58,13 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, + rope_parameters: dict, max_position: int = 4096 * 32, head_dim: int | None = None, rms_norm_eps: float = 1e-06, qkv_bias: bool = False, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, - rope_scaling: tuple | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, dual_chunk_attention_config: dict[str, Any] | None = None, @@ -89,7 +89,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.dual_chunk_attention_config = dual_chunk_attention_config self.qkv_proj = QKVParallelLinear( @@ -113,8 +112,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, ) self.attn = Attention( @@ -166,9 +164,7 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 1000000) - rope_scaling = getattr(config, "rope_scaling", None) + set_default_rope_theta(config, default_theta=1000000) dual_chunk_attention_config = getattr( config, "dual_chunk_attention_config", None ) @@ -187,13 +183,12 @@ def __init__( num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, rms_norm_eps=config.rms_norm_eps, qkv_bias=getattr(config, "attention_bias", False), head_dim=getattr(config, "head_dim", None), cache_config=cache_config, quant_config=quant_config, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, prefix=f"{prefix}.self_attn", attn_type=attn_type, dual_chunk_attention_config=dual_chunk_attention_config, diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 96751fee800b..8ee3dd99e11d 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -216,8 +216,7 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any], max_position_embeddings: int = 8192, head_dim: int | None = None, rms_norm_eps: float = 1e-06, @@ -247,7 +246,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.dual_chunk_attention_config = dual_chunk_attention_config @@ -273,8 +271,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - 
rope_scaling=rope_scaling, + rope_parameters=rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, ) self.attn = Attention( @@ -326,8 +323,6 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: quant_config = vllm_config.quant_config self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) dual_chunk_attention_config = getattr( config, "dual_chunk_attention_config", None @@ -336,8 +331,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, rms_norm_eps=config.rms_norm_eps, qkv_bias=getattr(config, "attention_bias", False), diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index ad631f61e4b9..bfed64728305 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -748,8 +748,7 @@ def __init__( head_size=self.head_dim, rotary_dim=self.head_dim, max_position=config.max_position_embeddings, - base=config.rope_theta, - rope_scaling=config.rope_scaling, + rope_parameters=config.rope_parameters, partial_rotary_factor=config.partial_rotary_factor, dual_chunk_attention_config=self.dual_chunk_attention_config, ) diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index d2fd74a5e41a..54ef56f83344 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -338,7 +338,6 @@ def __init__( head_size=head_dim, rotary_dim=head_dim // 2, max_position=8192, - base=10000.0, is_neox_style=True, ) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 0c546309400b..c10aeaec5ab8 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -345,7 +345,6 @@ def __init__( head_size=head_dim, rotary_dim=head_dim // 2, max_position=8192, - base=10000.0, is_neox_style=True, ) diff --git a/vllm/model_executor/models/seed_oss.py b/vllm/model_executor/models/seed_oss.py index bf211d28f184..4744d8e44f39 100644 --- a/vllm/model_executor/models/seed_oss.py +++ b/vllm/model_executor/models/seed_oss.py @@ -54,6 +54,7 @@ maybe_remap_kv_scale_name, ) from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import set_default_rope_theta from .interfaces import SupportsLoRA, SupportsPP from .utils import ( @@ -112,11 +113,10 @@ def __init__( num_heads: int, num_kv_heads: int, head_dim: int, + rope_parameters: dict, max_position: int = 4096 * 32, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, - rope_scaling: tuple | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, ) -> None: @@ -140,7 +140,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.qkv_proj = QKVParallelLinear( hidden_size, @@ -163,8 +162,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=self.rope_theta, - 
rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( self.num_heads, @@ -200,9 +198,7 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 1000000) - rope_scaling = getattr(config, "rope_scaling", None) + set_default_rope_theta(config, default_theta=1000000) # By default, SeedOss uses causal attention as it is a # decoder-only model. @@ -219,10 +215,9 @@ def __init__( max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, head_dim=config.head_dim, - rope_theta=rope_theta, cache_config=cache_config, quant_config=quant_config, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, prefix=f"{prefix}.self_attn", attn_type=attn_type, ) diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index 4ec855f79444..7e9fc51036d2 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -25,7 +25,6 @@ """Inference-only Solar model compatible with HuggingFace weights.""" from collections.abc import Iterable -from typing import Any import torch from torch import nn @@ -111,8 +110,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -142,7 +139,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -166,8 +162,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_heads, @@ -202,15 +197,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -224,8 +210,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 06eb7201c1a8..a738fcbb4ee2 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -153,7 +153,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.config.max_position_embeddings, - base=self.config.rope_theta, + rope_parameters=self.config.rope_parameters, partial_rotary_factor=self.partial_rotary_factor, ) self.attn = Attention( diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 0f2942acd500..1118fca3cac9 100644 --- 
a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -91,7 +91,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = config.rope_theta self.max_position_embeddings = config.max_position_embeddings self.use_bias = config.use_bias @@ -115,7 +114,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=int(self.rope_theta), + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.attn = Attention( diff --git a/vllm/model_executor/models/step3_text.py b/vllm/model_executor/models/step3_text.py index 4fff356b29e2..3c377a2c539d 100644 --- a/vllm/model_executor/models/step3_text.py +++ b/vllm/model_executor/models/step3_text.py @@ -36,6 +36,7 @@ ) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.configs.step3_vl import Step3TextConfig from .interfaces import SupportsPP from .utils import ( @@ -144,9 +145,8 @@ def __init__( num_heads: int, num_kv_heads: int, norm_eps: float, - rope_theta: int, + rope_parameters: dict[str, Any], share_q_dim: int | None = None, - rope_scaling: dict[str, Any] | None = None, max_position_embedding: int = 8192, head_dim: int = 256, cache_config: CacheConfig | None = None, @@ -198,8 +198,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embedding, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) scaling = self.head_dim**-0.5 self.attn = Attention( @@ -227,15 +226,13 @@ def forward( class Step3TextDecoderLayer(nn.Module): def __init__( self, - config: ModelConfig, + config: Step3TextConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", ) -> None: super().__init__() - config = config.hf_config self.hidden_size = config.hidden_size - rope_scaling = getattr(config, "rope_scaling", None) self.self_attn = Step3TextAttention( hidden_size=self.hidden_size, @@ -247,8 +244,7 @@ def __init__( max_position_embedding=config.max_position_embedding, head_dim=config.head_dim, share_q_dim=config.share_q_dim, - rope_theta=config.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, prefix=f"{prefix}.self_attn", ) @@ -338,7 +334,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: Step3TextDecoderLayer( - config=vllm_config.model_config, + config=config, cache_config=cache_config, quant_config=quant_config, prefix=prefix, diff --git a/vllm/model_executor/models/transformers/utils.py b/vllm/model_executor/models/transformers/utils.py index 517eb54d53ac..b807f45b5d52 100644 --- a/vllm/model_executor/models/transformers/utils.py +++ b/vllm/model_executor/models/transformers/utils.py @@ -22,6 +22,7 @@ import torch from torch import nn +from transformers.configuration_utils import ALLOWED_LAYER_TYPES from vllm.config.utils import getattr_iter from vllm.logger import init_logger @@ -203,5 +204,10 @@ def can_enable_torch_compile(vllm_config: "VllmConfig") -> bool: """ text_config = vllm_config.model_config.hf_config.get_text_config() # Dynamic rope scaling is not compatible with torch.compile - rope_scaling: dict = getattr(text_config, "rope_scaling", None) or {} - return 
rope_scaling.get("rope_type") != "dynamic" + rope_parameters: dict | None = getattr(text_config, "rope_parameters", None) or {} + if rope_parameters: + # Nest rope_parameters if not nested already to simplify logic + if not set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): + rope_parameters = {"": rope_parameters} + return all(rp["rope_type"] != "dynamic" for rp in rope_parameters.values()) + return True diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index 729a9655d087..653b5b9beef7 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -128,7 +128,6 @@ def __init__( tp_size = get_tensor_model_parallel_world_size() self.config = config self.num_hybrid_layers = num_hybrid_layers - self.rope_theta = config.rope_theta self.attention_hidden_size = config.attention_hidden_size self.total_num_attention_heads = config.num_attention_heads @@ -233,8 +232,7 @@ def __init__( head_size=self.attention_head_dim, rotary_dim=self.attention_head_dim, max_position=config.max_position_embeddings, - base=self.rope_theta, - rope_scaling=None, + rope_parameters=config.rope_parameters, is_neox_style=True, ) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index ac4a71648cec..4ca155af03dc 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -7,8 +7,9 @@ from collections.abc import Callable from dataclasses import asdict from functools import cache, partial +from importlib.metadata import version from pathlib import Path -from typing import Any, Literal, TypeVar +from typing import Any, Literal, TypeAlias, TypeVar import huggingface_hub from huggingface_hub import ( @@ -24,7 +25,9 @@ RepositoryNotFoundError, RevisionNotFoundError, ) +from packaging.version import Version from transformers import DeepseekV3Config, GenerationConfig, PretrainedConfig +from transformers.configuration_utils import ALLOWED_LAYER_TYPES from transformers.models.auto.image_processing_auto import get_image_processor_config from transformers.models.auto.modeling_auto import ( MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, @@ -390,21 +393,61 @@ def file_or_path_exists( ) -def patch_rope_scaling(config: PretrainedConfig) -> None: +def set_default_rope_theta(config: PretrainedConfig, default_theta: float) -> None: + """Some models may have no rope_theta in their config but still use RoPE. 
+ This function sets a default rope_theta if it's missing.""" + if getattr(config, "rope_parameters", None) is None: + config.rope_parameters = {"rope_type": "default"} + if "rope_theta" not in config.rope_parameters: + config.rope_parameters["rope_theta"] = default_theta + + +def patch_rope_parameters(config: PretrainedConfig) -> None: """Provide backwards compatibility for RoPE.""" - text_config = getattr(config, "text_config", None) - if text_config is not None: - patch_rope_scaling(text_config) + # Retrieve rope_parameters differently based on Transformers version + if Version(version("transformers")) >= Version("5.0.0.dev0"): + from transformers.modeling_rope_utils import RopeParameters - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None: - patch_rope_scaling_dict(rope_scaling) + rope_parameters: RopeParameters | dict[str, RopeParameters] | None = getattr( + config, "rope_parameters", None + ) + elif hasattr(config, "rope_parameters"): + # We are in Transformers v4 and rope_parameters + # has already been patched for this config + return + else: + # Convert Transformers v4 rope_theta and rope_scaling into rope_parameters + rope_theta: float | None = getattr(config, "rope_theta", None) + rope_scaling: dict | None = getattr(config, "rope_scaling", None) + rope_parameters = rope_scaling + # Move rope_theta into rope_parameters + if rope_theta is not None: + rope_parameters = rope_parameters or {"rope_type": "default"} + rope_parameters["rope_theta"] = rope_theta + # Add original_max_position_embeddings if present + if rope_parameters and ( + ompe := getattr(config, "original_max_position_embeddings", None) + ): + rope_parameters["original_max_position_embeddings"] = ompe + # Write back to config + config.rope_parameters = rope_parameters + + # No RoPE parameters to patch + if rope_parameters is None: + return + + # Handle nested rope_parameters in interleaved sliding attention models + if set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): + for rope_parameters_layer_type in rope_parameters.values(): + patch_rope_parameters_dict(rope_parameters_layer_type) + else: + patch_rope_parameters_dict(rope_parameters) -def patch_rope_scaling_dict(rope_scaling: dict[str, Any]) -> None: - if "rope_type" in rope_scaling and "type" in rope_scaling: - rope_type = rope_scaling["rope_type"] - rope_type_legacy = rope_scaling["type"] +def patch_rope_parameters_dict(rope_parameters: dict[str, Any]) -> None: + if "rope_type" in rope_parameters and "type" in rope_parameters: + rope_type = rope_parameters["rope_type"] + rope_type_legacy = rope_parameters["type"] if rope_type != rope_type_legacy: raise ValueError( f"Found conflicts between 'rope_type={rope_type}' (modern " @@ -412,28 +455,28 @@ def patch_rope_scaling_dict(rope_scaling: dict[str, Any]) -> None: "You should only specify one of them." 
) - if "rope_type" not in rope_scaling and "type" in rope_scaling: - rope_scaling["rope_type"] = rope_scaling["type"] + if "rope_type" not in rope_parameters and "type" in rope_parameters: + rope_parameters["rope_type"] = rope_parameters["type"] logger.info("Replacing legacy 'type' key with 'rope_type'") - if "rope_type" not in rope_scaling: - raise ValueError("rope_scaling should have a 'rope_type' key") + if "rope_type" not in rope_parameters: + raise ValueError("rope_parameters should have a 'rope_type' key") - if rope_scaling["rope_type"] == "su": - rope_scaling["rope_type"] = "longrope" + if rope_parameters["rope_type"] == "su": + rope_parameters["rope_type"] = "longrope" logger.warning("Replacing legacy rope_type 'su' with 'longrope'") - elif rope_scaling["rope_type"] == "mrope": - assert "mrope_section" in rope_scaling - rope_scaling["rope_type"] = "default" + elif rope_parameters["rope_type"] == "mrope": + assert "mrope_section" in rope_parameters + rope_parameters["rope_type"] = "default" logger.warning("Replacing legacy rope_type 'mrope' with 'default'") def _uses_mrope(config: PretrainedConfig) -> bool: - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is None: + rope_parameters = getattr(config, "rope_parameters", None) + if rope_parameters is None: return False - return "mrope_section" in rope_scaling + return "mrope_section" in rope_parameters def uses_mrope(config: PretrainedConfig) -> bool: @@ -690,7 +733,14 @@ def get_config( logger.debug("Overriding HF config with %s", hf_overrides_fn) config = hf_overrides_fn(config) - patch_rope_scaling(config) + # Exhaustively patch RoPE parameters everywhere they might be + patch_rope_parameters(config) + patch_rope_parameters(config.get_text_config()) + SubConfigs: TypeAlias = dict[str, PretrainedConfig] + sub_configs: SubConfigs | None = getattr(config, "sub_configs", None) + if sub_configs: + for sub_config in sub_configs: + patch_rope_parameters(getattr(config, sub_config)) if trust_remote_code: maybe_register_config_serialize_by_value() diff --git a/vllm/transformers_utils/configs/afmoe.py b/vllm/transformers_utils/configs/afmoe.py index 9b634fd037a3..47fee9882f9f 100644 --- a/vllm/transformers_utils/configs/afmoe.py +++ b/vllm/transformers_utils/configs/afmoe.py @@ -24,7 +24,7 @@ def __init__( rms_norm_eps: float = 1e-5, use_cache: bool = True, tie_word_embeddings: bool = False, - rope_theta: float = 10000.0, + rope_parameters: dict | None = None, rope_scaling: dict | None = None, num_experts: int = 64, num_experts_per_tok: int = 6, @@ -56,7 +56,10 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta + rope_theta = kwargs.pop("rope_theta", 10000.0) + if rope_parameters is None: + rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} + self.rope_parameters = rope_parameters self.rope_scaling = rope_scaling self.moe_intermediate_size = moe_intermediate_size diff --git a/vllm/transformers_utils/configs/arctic.py b/vllm/transformers_utils/configs/arctic.py index 1707e15285c8..ba4b1a8f701f 100644 --- a/vllm/transformers_utils/configs/arctic.py +++ b/vllm/transformers_utils/configs/arctic.py @@ -85,8 +85,15 @@ class ArcticConfig(PretrainedConfig): The id of the "end-of-sequence" token. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. 
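The patch_rope_parameters/patch_rope_parameters_dict pair above is what keeps older checkpoints working: a config that only carries rope_theta and rope_scaling is rewritten in place into a single rope_parameters dict, with the legacy 'type', 'su' and 'mrope' spellings normalized. A rough before/after, assuming a Transformers v4 environment (under v5 the config would already expose rope_parameters) and using SimpleNamespace as a stand-in for a PretrainedConfig:

from types import SimpleNamespace
from vllm.transformers_utils.config import patch_rope_parameters

cfg = SimpleNamespace(
    rope_theta=500000.0,
    rope_scaling={"type": "yarn", "factor": 4.0},
    original_max_position_embeddings=8192,
)
patch_rope_parameters(cfg)
# cfg.rope_parameters now looks roughly like:
# {"rope_type": "yarn", "factor": 4.0, "rope_theta": 500000.0,
#  "original_max_position_embeddings": 8192}
# (the legacy "type" key is left in place alongside "rope_type")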
- rope_theta (`float`, *optional*, defaults to 1000000.0): - The base period of the RoPE embeddings. + rope_parameters (`dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. + Expected contents: + `rope_theta` (`float`): The base period of the RoPE embeddings. + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. sliding_window (`int`, *optional*): Sliding window attention window size. If not specified, will default to `4096`. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -132,7 +139,7 @@ def __init__( bos_token_id=1, eos_token_id=2, tie_word_embeddings=False, - rope_theta=1e6, + rope_parameters: dict[str, Any] | None = None, sliding_window=None, attention_dropout=0.0, num_experts_per_tok=1, @@ -165,7 +172,10 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta + rope_theta = kwargs.pop("rope_theta", 1e6) + if rope_parameters is None: + rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} + self.rope_parameters = rope_parameters self.attention_dropout = attention_dropout self.num_experts_per_tok = num_experts_per_tok diff --git a/vllm/transformers_utils/configs/flex_olmo.py b/vllm/transformers_utils/configs/flex_olmo.py index 1f2f4d446288..c343dc0999a8 100644 --- a/vllm/transformers_utils/configs/flex_olmo.py +++ b/vllm/transformers_utils/configs/flex_olmo.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any from transformers.configuration_utils import PretrainedConfig @@ -25,8 +26,7 @@ def __init__( bos_token_id=None, eos_token_id=100257, tie_word_embeddings=False, - rope_theta=500000.0, - rope_scaling=None, + rope_parameters: dict[str, Any] | None = None, attention_bias=False, attention_dropout=0.0, num_experts_per_tok=5, @@ -62,8 +62,13 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"} + rope_theta = kwargs.pop("rope_theta", 500000.0) + if "rope_theta" not in rope_parameters: + rope_parameters["rope_theta"] = rope_theta + self.rope_parameters = rope_parameters self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.num_experts_per_tok = num_experts_per_tok @@ -73,5 +78,5 @@ def __init__( self.norm_topk_prob = norm_topk_prob # Validate the correctness of rotary position embeddings parameters # BC: if there is a 'type' field, move it to 'rope_type'. 
- if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + if self.rope_parameters is not None and "type" in self.rope_parameters: + self.rope_parameters["rope_type"] = self.rope_parameters["type"] diff --git a/vllm/transformers_utils/configs/kimi_linear.py b/vllm/transformers_utils/configs/kimi_linear.py index 65ddf48c5249..14894816801d 100644 --- a/vllm/transformers_utils/configs/kimi_linear.py +++ b/vllm/transformers_utils/configs/kimi_linear.py @@ -29,8 +29,7 @@ def __init__( pad_token_id=0, bos_token_id=1, eos_token_id=2, - rope_theta=10000.0, - rope_scaling=None, + rope_parameters=None, tie_word_embeddings=False, moe_intermediate_size: int | None = None, moe_renormalize: bool = True, @@ -73,8 +72,13 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"} + rope_theta = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in rope_parameters: + rope_parameters["rope_theta"] = rope_theta + self.rope_parameters = rope_parameters self.q_lora_rank = q_lora_rank self.kv_lora_rank = kv_lora_rank diff --git a/vllm/transformers_utils/configs/lfm2_moe.py b/vllm/transformers_utils/configs/lfm2_moe.py index 37c038e12db8..b399a03c030f 100644 --- a/vllm/transformers_utils/configs/lfm2_moe.py +++ b/vllm/transformers_utils/configs/lfm2_moe.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any from transformers.configuration_utils import PretrainedConfig @@ -35,8 +36,8 @@ class Lfm2MoeConfig(PretrainedConfig): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `True`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 1000000.0): - The base period of the RoPE embeddings. + rope_parameters (`dict`, *optional*): + The parameters of the RoPE embeddings. max_position_embeddings (`int`, *optional*, defaults to 128000): The maximum sequence length that this model might ever be used with. 
use_cache (`bool`, *optional*, defaults to `True`): @@ -100,7 +101,7 @@ def __init__( bos_token_id: int = 1, eos_token_id: int = 2, tie_word_embeddings: bool = True, - rope_theta: float = 1000000.0, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 128_000, use_cache: bool = True, norm_eps: float = 0.00001, @@ -121,7 +122,10 @@ def __init__( self.hidden_size = hidden_size self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers - self.rope_theta = rope_theta + rope_theta = kwargs.pop("rope_theta", 1000000.0) + if rope_parameters is None: + rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} + self.rope_parameters = rope_parameters self.max_position_embeddings = max_position_embeddings self.use_cache = use_cache self.norm_eps = norm_eps diff --git a/vllm/transformers_utils/configs/midashenglm.py b/vllm/transformers_utils/configs/midashenglm.py index e49bd26b2b00..f1bbd057103e 100644 --- a/vllm/transformers_utils/configs/midashenglm.py +++ b/vllm/transformers_utils/configs/midashenglm.py @@ -98,6 +98,6 @@ def __init__( if text_config else Qwen2_5OmniTextConfig() ) - self.text_config.rope_scaling = None # uses_mrope is false + self.text_config.rope_parameters = None # uses_mrope is false self.audio_token_id = audio_token_id super().__init__(**kwargs) diff --git a/vllm/transformers_utils/configs/mistral.py b/vllm/transformers_utils/configs/mistral.py index c6f04febe37e..8f72f0b28b0d 100644 --- a/vllm/transformers_utils/configs/mistral.py +++ b/vllm/transformers_utils/configs/mistral.py @@ -86,13 +86,13 @@ def _remap_mistral_yarn_args(config: dict) -> dict: "apply_scale": "apply_yarn_scaling", } yarn_config = config.get("yarn") or {} - config["rope_scaling"] = { + config["rope_parameters"] = { "rope_type": "yarn", "mscale_all_dim": 1, } for old_name, new_name in yarn_config_map.items(): if old_name in yarn_config: - config["rope_scaling"][new_name] = yarn_config.pop(old_name) + config["rope_parameters"][new_name] = yarn_config.pop(old_name) assert len(yarn_config) == 0, f"Unparsed yarn config: {yarn_config}" diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py index 60eed549561f..d112c71d7d20 100644 --- a/vllm/transformers_utils/configs/nemotron.py +++ b/vllm/transformers_utils/configs/nemotron.py @@ -88,8 +88,8 @@ class NemotronConfig(PretrainedConfig): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. + rope_parameters (`dict`, *optional*): + The parameters of the RoPE embeddings. partial_rotary_factor (`float`, *optional*, defaults to 0.5): Percentage of the query and keys which will have rotary embedding. 
attention_bias (`bool`, *optional*, defaults to `False`): @@ -132,8 +132,7 @@ def __init__( bos_token_id=2, eos_token_id=3, tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, + rope_parameters=None, partial_rotary_factor=0.5, attention_bias=False, attention_dropout=0.0, @@ -160,8 +159,13 @@ def __init__( self.initializer_range = initializer_range self.norm_eps = norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"} + rope_theta = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in rope_parameters: + rope_parameters["rope_theta"] = rope_theta + self.rope_parameters = rope_parameters # for backward compatibility partial_rotary_factor = ( kwargs.get("rope_percent") @@ -169,7 +173,7 @@ def __init__( or partial_rotary_factor ) self.partial_rotary_factor = partial_rotary_factor - self._rope_scaling_validation() + self._rope_parameters_validation() self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias @@ -182,31 +186,29 @@ def __init__( **kwargs, ) - def _rope_scaling_validation(self): + def _rope_parameters_validation(self): """ - Validate the `rope_scaling` configuration. + Validate the `rope_parameters` configuration. """ - if self.rope_scaling is None: + if self.rope_parameters is None: return - if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: - raise ValueError( - "`rope_scaling` must be a dictionary with two fields, " - f"`type` and `factor`, got {self.rope_scaling}" - ) - rope_scaling_type = self.rope_scaling.get("type", None) - rope_scaling_factor = self.rope_scaling.get("factor", None) - if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: - raise ValueError( - "`rope_scaling`'s type field must be one of ['linear', " - f"'dynamic'], got {rope_scaling_type}" - ) - if ( - rope_scaling_factor is None - or not isinstance(rope_scaling_factor, float) - or rope_scaling_factor <= 1.0 - ): + rope_type: str | None = self.rope_parameters.get("rope_type", None) + factor: float | None = self.rope_parameters.get("factor", None) + + if rope_type not in {"default", "linear", "dynamic"}: raise ValueError( - "`rope_scaling`'s factor field must be a float > 1, got " - f"{rope_scaling_factor}" + "`rope_type` must be one of ['default', 'linear', 'dynamic'], " + f"got {rope_type}" ) + if rope_type != "default": + if factor is None: + raise ValueError( + "If `rope_type` is not 'default', `rope_parameters` " + "must include a `factor` field. Got `None`." 
+ ) + if not isinstance(factor, float) or factor <= 1.0: + raise ValueError( + "`rope_parameters`'s factor field must be a float > 1, got " + f"{factor}" + ) diff --git a/vllm/transformers_utils/configs/olmo3.py b/vllm/transformers_utils/configs/olmo3.py index f5a9a7cd36bd..c4691b661af3 100644 --- a/vllm/transformers_utils/configs/olmo3.py +++ b/vllm/transformers_utils/configs/olmo3.py @@ -24,8 +24,7 @@ def __init__( bos_token_id=None, eos_token_id=50279, tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, + rope_parameters=None, attention_bias=False, attention_dropout=0.0, rms_norm_eps=1e-5, @@ -63,8 +62,13 @@ def __init__( self.hidden_act = hidden_act self.initializer_range = initializer_range self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"} + rope_theta = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in rope_parameters: + rope_parameters["rope_theta"] = rope_theta + self.rope_parameters = rope_parameters self.attention_bias = attention_bias self.attention_dropout = attention_dropout diff --git a/vllm/transformers_utils/configs/qwen3_next.py b/vllm/transformers_utils/configs/qwen3_next.py index 21750bde2f87..d2fe58d48da6 100644 --- a/vllm/transformers_utils/configs/qwen3_next.py +++ b/vllm/transformers_utils/configs/qwen3_next.py @@ -66,13 +66,12 @@ class Qwen3NextConfig(PretrainedConfig): relevant if `config.is_decoder=True`. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): + rope_parameters (`dict`, *optional*): Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value accordingly. Expected contents: + `rope_theta` (`float`): The base period of the RoPE embeddings. `rope_type` (`str`): The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', 'llama3'], with 'default' being the original RoPE implementation. 
@@ -199,8 +198,7 @@ def __init__( rms_norm_eps=1e-6, use_cache=True, tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, + rope_parameters=None, partial_rotary_factor=0.25, attention_bias=False, attention_dropout=0.0, @@ -236,8 +234,13 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"} + rope_theta = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in rope_parameters: + rope_parameters["rope_theta"] = rope_theta + self.rope_parameters = rope_parameters self.partial_rotary_factor = partial_rotary_factor self.attention_bias = attention_bias self.attention_dropout = attention_dropout diff --git a/vllm/transformers_utils/configs/step3_vl.py b/vllm/transformers_utils/configs/step3_vl.py index 637b82d88e26..0ee650a70451 100644 --- a/vllm/transformers_utils/configs/step3_vl.py +++ b/vllm/transformers_utils/configs/step3_vl.py @@ -52,8 +52,7 @@ def __init__( moe_intermediate_size: int = 5120, moe_num_experts: int = 48, moe_top_k: int = 3, - rope_theta: float = 500000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embedding: int = 65536, share_expert_dim: int = 5120, share_q_dim: int = 2048, @@ -130,8 +129,13 @@ def __init__( self.moe_intermediate_size = moe_intermediate_size self.moe_num_experts = moe_num_experts self.moe_top_k = moe_top_k - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"} + rope_theta = kwargs.pop("rope_theta", 500000.0) + if "rope_theta" not in rope_parameters: + rope_parameters["rope_theta"] = rope_theta + self.rope_parameters = rope_parameters self.max_position_embedding = max_position_embedding self.share_expert_dim = share_expert_dim self.share_q_dim = share_q_dim From 0c80efd94fb8c17cfc7d1bcb9cdb65f154340994 Mon Sep 17 00:00:00 2001 From: Yuxuan Zhang <2448370773@qq.com> Date: Thu, 20 Nov 2025 01:32:55 +0800 Subject: [PATCH 195/578] GLM-V video segmentation solution adjustment (#28941) Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com> --- vllm/model_executor/models/glm4_1v.py | 94 +++++++++++++++++++++++++-- 1 file changed, 90 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 6581bbda6d60..d141e9549806 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -37,7 +37,7 @@ import torch.nn as nn import torch.nn.functional as F from einops import rearrange -from transformers import BatchFeature +from transformers import BatchFeature, Glm4vProcessor from transformers.models.glm4v.configuration_glm4v import Glm4vVisionConfig from transformers.models.glm4v.image_processing_glm4v import ( Glm4vImageProcessor, @@ -1028,7 +1028,7 @@ def get_num_frames_with_most_features( return max(max_frames_per_video, 1) - def _get_video_second_idx( + def _get_video_second_idx_glm4v( self, metadata: dict[str, Any], total_frames: int ) -> list[int]: video_processor = self.get_video_processor() @@ -1079,6 +1079,83 @@ def _get_video_second_idx( 
selected_timestamps.append(timestamps_list[idx]) return selected_timestamps + def _get_video_second_idx_glm46v( + self, metadata: dict[str, Any], total_frames: int + ) -> list[int]: + video_processor = self.get_video_processor() + + video_fps = metadata["fps"] + meta_frames = metadata.get("total_num_frames", total_frames) + max_frame_idx = meta_frames - 1 + duration = metadata.get("duration", round(max_frame_idx / video_fps) + 1) + + do_sample_frames = metadata.get("do_sample_frames", True) + if not do_sample_frames: + frame_indices = metadata["frames_indices"] + else: + DYNAMIC_FPS_THRES = {30: 3, 300: 1, 2400: 0.5} + MAX_FRAME_COUNT_DYNAMIC = 640 + MAX_DURATION = 2400 + + effective_duration = min(duration, MAX_DURATION) + if effective_duration <= 30: + target_fps = DYNAMIC_FPS_THRES[30] + elif effective_duration <= 300: + target_fps = DYNAMIC_FPS_THRES[300] + else: + target_fps = DYNAMIC_FPS_THRES[2400] + + temporal_patch_size = getattr(video_processor, "temporal_patch_size", 1) + extract_t = int(effective_duration * target_fps * temporal_patch_size) + extract_t = min(extract_t, MAX_FRAME_COUNT_DYNAMIC) + + duration_per_frame = 1 / video_fps + timestamps = [i * duration_per_frame for i in range(meta_frames)] + max_second = int(duration) + + if meta_frames < extract_t: + frame_indices = np.linspace( + 0, meta_frames - 1, extract_t, dtype=int + ).tolist() + else: + frame_indices = [] + current_second = 0.0 + inv_fps = 1 / (temporal_patch_size * target_fps) + for frame_index in range(meta_frames): + if timestamps[frame_index] >= current_second: + current_second += inv_fps + frame_indices.append(frame_index) + if current_second >= max_second: + break + + if len(frame_indices) < extract_t: + if len(frame_indices) == 0: + start, end = 0, max(meta_frames - 1, 0) + else: + start, end = frame_indices[0], frame_indices[-1] + frame_indices = np.linspace(start, end, extract_t, dtype=int).tolist() + elif len(frame_indices) > extract_t: + frame_indices = np.linspace( + 0, meta_frames - 1, extract_t, dtype=int + ).tolist() + + seen, uniq = set(), [] + for idx in frame_indices: + if idx not in seen: + seen.add(idx) + uniq.append(idx) + + if len(uniq) & 1: + uniq.append(uniq[-1]) + + frame_indices = uniq + full_second_idxs = [int(idx / video_fps) for idx in frame_indices] + timestamps_list = full_second_idxs[::2] + selected_timestamps = [] + for idx in range(len(timestamps_list)): + selected_timestamps.append(timestamps_list[idx]) + return selected_timestamps + def _construct_video_placeholder( self, video_array: np.ndarray, @@ -1097,9 +1174,18 @@ def _construct_video_placeholder( merge_length = image_processor.merge_size**2 assert isinstance(grid_thw, torch.Tensor) - timestamps = self._get_video_second_idx(metadata, len(video_array)) + timestamps = ( + self._get_video_second_idx_glm4v(metadata, len(video_array)) + if isinstance(hf_processor, Glm4vProcessor) + else self._get_video_second_idx_glm46v(metadata, len(video_array)) + ) + + timestamp_format = ( + "{}" if isinstance(hf_processor, Glm4vProcessor) else "{:.1f} seconds" + ) frames_idx_token = [ - tokenizer.encode(str(i), add_special_tokens=False) for i in timestamps + tokenizer.encode(timestamp_format.format(i), add_special_tokens=False) + for i in timestamps ] T, H, W = grid_thw num_tokens_per_frame = int(H * W) // merge_length From 61728cd1dfb03cbbfa03924f2a2cda311cfc13ac Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Wed, 19 Nov 2025 13:32:19 -0500 Subject: [PATCH 196/578] Re-enable FlashInfer for 
Llama4 on Blackwell in e2e fusion tests (#28966) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luka Govedič Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: ProExpertProg <11367180+ProExpertProg@users.noreply.github.com> Co-authored-by: Luka Govedič --- .buildkite/test-pipeline.yaml | 2 ++ tests/compile/distributed/test_fusions_e2e.py | 12 ++++-------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index d4b6f4077ab3..98daebcc0693 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -930,6 +930,8 @@ steps: - csrc/quantization/fp4/ - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - vllm/v1/attention/backends/flashinfer.py + - vllm/v1/worker/ + - vllm/v1/cudagraph_dispatcher.py - vllm/compilation/ # can affect pattern matching - vllm/model_executor/layers/layernorm.py diff --git a/tests/compile/distributed/test_fusions_e2e.py b/tests/compile/distributed/test_fusions_e2e.py index 2e1b595a4389..661172e1965b 100644 --- a/tests/compile/distributed/test_fusions_e2e.py +++ b/tests/compile/distributed/test_fusions_e2e.py @@ -47,12 +47,8 @@ class ModelBackendTestCase(NamedTuple): ModelBackendTestCase( # Use smaller model for L40s in CI model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", - # TODO while llama4 is broken, use FLASHINFER for llama3 on Blackwell - # so FI attention+fp8_quant is at least tested once model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"), - backend=AttentionBackendEnum.FLASHINFER - if is_blackwell() - else AttentionBackendEnum.TRITON_ATTN, + backend=AttentionBackendEnum.TRITON_ATTN, matches=Matches( attention_fusion=32, allreduce_fusion=65, @@ -65,9 +61,9 @@ class ModelBackendTestCase(NamedTuple): model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"), # TODO FlashInfer attn broken on Hopper with kvcache=fp8: # https://github.com/vllm-project/vllm/issues/28568 - # TODO FlashInfer attn broken on Blackwell for llama4: - # https://github.com/vllm-project/vllm/issues/28604 - backend=AttentionBackendEnum.TRITON_ATTN, + backend=AttentionBackendEnum.FLASHINFER + if is_blackwell() + else AttentionBackendEnum.TRITON_ATTN, matches=Matches( attention_fusion=48, allreduce_fusion=96, From 3319a493fcc3e4733382f0dc812184234e9c3dcb Mon Sep 17 00:00:00 2001 From: Jialin Ouyang Date: Wed, 19 Nov 2025 11:20:22 -0800 Subject: [PATCH 197/578] [Core] Reuse created spec tokens lists to mitigate GC cost (#28917) Signed-off-by: Jialin Ouyang --- vllm/v1/worker/gpu_input_batch.py | 18 ++++++++++++------ vllm/v1/worker/gpu_model_runner.py | 3 ++- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 023b5edb2c34..c1bfe727d86e 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -251,7 +251,7 @@ def __init__( self.logitsprocs_need_output_token_ids = logitsprocs_need_output_token_ids # Store last speculative tokens for sampler. - self.spec_token_ids: list[list[int] | None] = [] + self.spec_token_ids: list[list[int]] = [[] for _ in range(max_num_reqs)] # This is updated each time the batch constituents change. 
self.sampling_metadata = self._make_sampling_metadata() @@ -313,7 +313,7 @@ def add_request( else: self._req_ids[req_index] = req_id self.req_output_token_ids[req_index] = request.output_token_ids - self.spec_token_ids[req_index] = [] + self.spec_token_ids[req_index].clear() self.req_id_to_index[req_id] = req_index @@ -462,7 +462,7 @@ def remove_request(self, req_id: str) -> int | None: self.batch_update_builder.removed_append(req_index) self._req_ids[req_index] = None self.req_output_token_ids[req_index] = None - self.spec_token_ids[req_index] = None + self.spec_token_ids[req_index].clear() # LoRA lora_id = self.request_lora_mapping[req_index] @@ -654,9 +654,15 @@ def condense(self) -> None: self.req_output_token_ids[last_req_index] = None self.req_id_to_index[req_id] = empty_index - spec_token_ids = self.spec_token_ids[last_req_index] - self.spec_token_ids[empty_index] = spec_token_ids - self.spec_token_ids[last_req_index] = None + if last_req_index != empty_index: + ( + self.spec_token_ids[last_req_index], + self.spec_token_ids[empty_index], + ) = ( + self.spec_token_ids[empty_index], + self.spec_token_ids[last_req_index], + ) + self.spec_token_ids[last_req_index].clear() num_tokens = self.num_tokens[last_req_index] self.token_ids_cpu[empty_index, :num_tokens] = self.token_ids_cpu[ diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 3b00085b6bb9..0c35f1330e9f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -892,7 +892,8 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # conform to the schema. This can result in # scheduler_output.scheduled_spec_decode_tokens being empty, # even when speculative decoding is enabled. - self.input_batch.spec_token_ids[req_index] = spec_token_ids + self.input_batch.spec_token_ids[req_index].clear() + self.input_batch.spec_token_ids[req_index].extend(spec_token_ids) # there are no draft tokens with async scheduling, # we clear the spec_decoding info in scheduler_output and From fe69f331f84d99541564dfe4852dd45220ed7875 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Wed, 19 Nov 2025 14:23:54 -0500 Subject: [PATCH 198/578] [Kernels] Improve H200 Fused MoE Config (#28992) Signed-off-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> --- ...,dtype=fp8_w8a8,block_shape=[128,128].json | 72 +++++++++---------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json index 6fcf408755f5..532c16e89926 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -1,11 +1,11 @@ { "1": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 5 + "num_stages": 4 }, "2": { "BLOCK_SIZE_M": 16, @@ -13,82 +13,82 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "4": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4 }, "8": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "16": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, "24": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "32": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 5 }, "48": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3 }, "64": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3 }, "96": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, "128": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3 }, "256": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, @@ -96,10 +96,10 @@ "num_stages": 3 }, "512": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, @@ -109,7 +109,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "1536": { "BLOCK_SIZE_M": 64, @@ -117,21 +117,21 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "2048": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "3072": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4 }, @@ -139,7 +139,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 } From 9d2d5612573c20f8bf00242a8525c2a5dcfe4c06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=B0=E5=85=AE?= <38908462+zhyajie@users.noreply.github.com> Date: Thu, 20 Nov 2025 03:30:57 +0800 Subject: [PATCH 199/578] [Bugfix] Fix precision corruption when shared_experts_stream=None (#28942) Signed-off-by: zhyajie Co-authored-by: zhyajie --- vllm/model_executor/layers/fused_moe/layer.py | 11 +++++++---- vllm/utils/torch_utils.py | 3 +-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index c41995e4a913..8e9bba344287 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -371,8 +371,8 @@ def __init__( logger.info_once("Disabling MoE shared_experts cuda stream") self.shared_experts_stream = None else: - # TODO(rob): enable shared expert overlap with non-cuda. 
- # aux_stream() returns None on non-cuda platforms. + # TODO(rob): enable shared expert overlap with non-cuda-alike. + # aux_stream() returns None on non-cuda-alike platforms. self.shared_experts_stream = aux_stream() if self.shared_experts_stream is not None: logger.info_once("Enabled separate cuda stream for MoE shared_experts") @@ -1865,6 +1865,11 @@ def forward_impl( hidden_states_combined, router_logits = get_ep_group().dispatch( hidden_states, router_logits, self.is_sequence_parallel ) + # Run shared experts before matrix multiply. + # because matrix multiply maybe modify the hidden_states. + if has_separate_shared_experts and not use_shared_experts_stream: + assert self.shared_experts is not None + shared_output = self.shared_experts(hidden_states) # Matrix multiply. final_hidden_states = self.quant_method.apply( @@ -1908,8 +1913,6 @@ def forward_impl( # conflict with the main stream shared_output = self.shared_experts(hidden_states_clone) current_stream().wait_stream(self.shared_experts_stream) - else: - shared_output = self.shared_experts(hidden_states) final_hidden_states = ( shared_output, diff --git a/vllm/utils/torch_utils.py b/vllm/utils/torch_utils.py index 7c094e14cff7..3661dfd09047 100644 --- a/vllm/utils/torch_utils.py +++ b/vllm/utils/torch_utils.py @@ -426,8 +426,7 @@ def aux_stream() -> torch.cuda.Stream | None: from vllm.platforms import current_platform - # TODO: validate this works properly on ROCm platform. - if _aux_stream is None and current_platform.is_cuda(): + if _aux_stream is None and current_platform.is_cuda_alike(): _aux_stream = torch.cuda.Stream() return _aux_stream From ac10fd3c6900228e3c0a8fae20d039668c132446 Mon Sep 17 00:00:00 2001 From: Aleksandr Malyshev <164964928+maleksan85@users.noreply.github.com> Date: Wed, 19 Nov 2025 11:59:30 -0800 Subject: [PATCH 200/578] Upstreaming aiter triton attention backend as a new backend (#28701) Signed-off-by: Aleksandr Malyshev Co-authored-by: Aleksandr Malyshev --- vllm/attention/backends/registry.py | 3 + vllm/platforms/rocm.py | 4 +- .../backends/mla/aiter_triton_mla.py | 74 +++++++++++++++++++ 3 files changed, 80 insertions(+), 1 deletion(-) create mode 100644 vllm/v1/attention/backends/mla/aiter_triton_mla.py diff --git a/vllm/attention/backends/registry.py b/vllm/attention/backends/registry.py index 51899b023591..91e1cad01f4f 100644 --- a/vllm/attention/backends/registry.py +++ b/vllm/attention/backends/registry.py @@ -46,6 +46,9 @@ class AttentionBackendEnum(Enum, metaclass=_AttentionBackendEnumMeta): XFORMERS = "vllm.v1.attention.backends.xformers.XFormersAttentionBackend" ROCM_ATTN = "vllm.v1.attention.backends.rocm_attn.RocmAttentionBackend" ROCM_AITER_MLA = "vllm.v1.attention.backends.mla.rocm_aiter_mla.AiterMLABackend" + ROCM_AITER_TRITON_MLA = ( + "vllm.v1.attention.backends.mla.aiter_triton_mla.AiterTritonMLABackend" + ) ROCM_AITER_FA = ( "vllm.v1.attention.backends.rocm_aiter_fa.AiterFlashAttentionBackend" ) diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index bb116792fed5..f07f068a9249 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -234,7 +234,6 @@ def get_attn_backend_cls( if rocm_aiter_ops.is_mla_enabled() or block_size == 1 else AttentionBackendEnum.TRITON_MLA ) - if selected_backend == AttentionBackendEnum.TRITON_MLA: if block_size != 1: logger.info_once("Using Triton MLA backend.") @@ -246,6 +245,9 @@ def get_attn_backend_cls( if selected_backend == AttentionBackendEnum.ROCM_AITER_MLA: logger.info("Using AITER MLA backend.") return 
AttentionBackendEnum.ROCM_AITER_MLA.get_path() + if selected_backend == AttentionBackendEnum.ROCM_AITER_TRITON_MLA: + logger.info("Using AITER TRITON MLA backend.") + return AttentionBackendEnum.ROCM_AITER_TRITON_MLA.get_path() raise ValueError( f" The selected backend, {selected_backend.name}," diff --git a/vllm/v1/attention/backends/mla/aiter_triton_mla.py b/vllm/v1/attention/backends/mla/aiter_triton_mla.py new file mode 100644 index 000000000000..8a92152a0ca5 --- /dev/null +++ b/vllm/v1/attention/backends/mla/aiter_triton_mla.py @@ -0,0 +1,74 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from vllm.v1.attention.backends.mla.common import MLACommonBackend +from vllm.v1.attention.backends.mla.rocm_aiter_mla import ( + AiterMLAImpl, + AiterMLAMetadataBuilder, +) + + +class AiterTritonMLABackend(MLACommonBackend): + @staticmethod + def get_name() -> str: + return "AITER_TRITON_MLA" + + @staticmethod + def get_impl_cls() -> type["AiterTritonMLAImpl"]: + return AiterTritonMLAImpl + + @staticmethod + def get_builder_cls() -> type["AiterMLAMetadataBuilder"]: + return AiterMLAMetadataBuilder + + +class AiterTritonMLAImpl(AiterMLAImpl): + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: list[float] | None, + sliding_window: int | None, + kv_cache_dtype: str, + logits_soft_cap: float | None, + attn_type: str, + kv_sharing_target_layer_name: str | None, + # MLA Specific Arguments + **mla_args, + ) -> None: + super().__init__( + num_heads, + head_size, + scale, + num_kv_heads, + alibi_slopes, + sliding_window, + kv_cache_dtype, + logits_soft_cap, + attn_type, + kv_sharing_target_layer_name, + **mla_args, + ) + from aiter.ops.triton.mha import flash_attn_varlen_func + + self.flash_attn_varlen_func = flash_attn_varlen_func + + def _flash_attn_varlen_diff_headdims( + self, q, k, v, return_softmax_lse=False, softmax_scale=None, **kwargs + ): + result = self.flash_attn_varlen_func( + q, + k, + v, + softmax_scale=softmax_scale, + return_lse=return_softmax_lse, + **kwargs, + ) + # Transpose the LSE if Triton MHA is used: + # (q.shape[0], num_q_heads) to (num_q_heads, q.shape[0]) + if type(result) is tuple and return_softmax_lse: + output, lse = result + lse = lse.T.contiguous() + return (output, lse) + return result From 02f5903b84cfdf0b7cb31d46e995e3d4b9ad9e53 Mon Sep 17 00:00:00 2001 From: Izzy Putterman Date: Wed, 19 Nov 2025 12:01:05 -0800 Subject: [PATCH 201/578] Eagle: MM Cuda Graphs with MRope (#28896) Signed-off-by: Izzy Putterman Co-authored-by: Cyrus Leung --- vllm/model_executor/models/llama_eagle3.py | 14 ++++++-------- vllm/v1/spec_decode/eagle.py | 13 +++++++++++-- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py index 75c671311b49..3eaf2d80082f 100644 --- a/vllm/model_executor/models/llama_eagle3.py +++ b/vllm/model_executor/models/llama_eagle3.py @@ -23,7 +23,6 @@ maybe_remap_kv_scale_name, ) from vllm.model_executor.models.llama import LlamaDecoderLayer, LlamaForCausalLM -from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import NestedTensors from .utils import ( @@ -121,13 +120,12 @@ def forward( @support_torch_compile( - # torch.compile is disabled for multimodal EAGLE3 models due to constraint - # violations with dynamic shapes during tensor concatenation operations. 
- # See: https://github.com/vllm-project/vllm/pull/22872/files#r2362028132 - # Non-multimodal EAGLE3 models can still use torch.compile safely. - enable_if=lambda vllm_config: not MULTIMODAL_REGISTRY.supports_multimodal_inputs( - vllm_config.model_config - ), + dynamic_arg_dims={ + "input_ids": 0, + "positions": -1, + "hidden_states": 0, + "input_embeds": 0, + } ) class LlamaModel(nn.Module): def __init__( diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 5bf2503c3027..406bb696bd4c 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -116,9 +116,18 @@ def __init__( ) self.uses_mrope = self.vllm_config.model_config.uses_mrope if self.uses_mrope: - # M-RoPE need (3, max_num_tokens) + # NOTE: `mrope_positions` is implemented with one additional dummy + # position on purpose to make it non-contiguous so that it can work + # with torch compile. + # See detailed explanation in https://github.com/vllm-project/vllm/pull/12128#discussion_r1926431923 + + # NOTE: When M-RoPE is enabled, position ids are 3D regardless of + # the modality of inputs. For text-only inputs, each dimension has + # identical position IDs, making M-RoPE functionally equivalent to + # 1D-RoPE. + # See page 5 of https://arxiv.org/abs/2409.12191 self.mrope_positions = torch.zeros( - (3, self.max_num_tokens), dtype=torch.int64, device=device + (3, self.max_num_tokens + 1), dtype=torch.int64, device=device ) else: # RoPE need (max_num_tokens,) From 2fd893b4cec0975a2a8430077fd9b4f294eb3561 Mon Sep 17 00:00:00 2001 From: Qiu Date: Thu, 20 Nov 2025 04:52:44 +0800 Subject: [PATCH 202/578] [Feature] Prefill Context Parallel (PCP) basic support (#28718) Signed-off-by: QiuChunshuo Signed-off-by: FENP Signed-off-by: LookAround Signed-off-by: Jingchun Gao Signed-off-by: zhenwenqi2024 Co-authored-by: FENP Co-authored-by: LookAround Co-authored-by: Jingchun Gao Co-authored-by: zhenwenqi2024 Co-authored-by: Jingchun Gao <63247409+gjc0824@users.noreply.github.com> --- tests/distributed/test_context_parallel.py | 12 +-- .../moe/modular_kernel_tools/common.py | 7 +- tests/v1/worker/test_gpu_model_runner.py | 4 +- vllm/attention/backends/abstract.py | 17 +++++ vllm/attention/ops/common.py | 40 +++++++++- vllm/config/parallel.py | 40 +++++++--- vllm/config/vllm.py | 32 ++++++-- vllm/distributed/parallel_state.py | 74 +++++++++++++++---- vllm/engine/arg_utils.py | 22 ++++++ .../model_executor/layers/fused_moe/config.py | 59 ++++++++++----- vllm/model_executor/layers/fused_moe/layer.py | 32 ++++++++ vllm/model_executor/models/gpt_oss.py | 9 ++- vllm/v1/attention/backends/flash_attn.py | 6 +- vllm/v1/attention/backends/mla/common.py | 6 +- vllm/v1/attention/backends/utils.py | 18 ++--- vllm/v1/core/kv_cache_coordinator.py | 17 +++++ vllm/v1/core/kv_cache_manager.py | 9 +-- vllm/v1/core/kv_cache_utils.py | 13 +++- vllm/v1/core/sched/scheduler.py | 2 + vllm/v1/core/single_type_kv_cache_manager.py | 19 ++++- vllm/v1/engine/core.py | 1 + vllm/v1/executor/multiproc_executor.py | 23 ++++-- vllm/v1/kv_cache_interface.py | 5 +- vllm/v1/worker/block_table.py | 35 +++++---- vllm/v1/worker/gpu_input_batch.py | 4 +- vllm/v1/worker/gpu_model_runner.py | 4 +- vllm/v1/worker/gpu_worker.py | 3 + 27 files changed, 399 insertions(+), 114 deletions(-) diff --git a/tests/distributed/test_context_parallel.py b/tests/distributed/test_context_parallel.py index b16fd0d06b14..7e4713b8aece 100644 --- a/tests/distributed/test_context_parallel.py +++ b/tests/distributed/test_context_parallel.py @@ -31,7 +31,7 @@ class 
ParallelSetup(NamedTuple): tp_size: int pp_size: int dcp_size: int - dcp_kv_cache_interleave_size: int + cp_kv_cache_interleave_size: int eager_mode: bool chunked_prefill: bool @@ -55,7 +55,7 @@ def detailed( tp_base: int = 4, pp_base: int = 1, dcp_base: int = 1, - dcp_kv_cache_interleave_size: int = 1, + cp_kv_cache_interleave_size: int = 1, multi_node_only: bool = False, runner: RunnerOption = "auto", load_format: str | None = None, @@ -71,7 +71,7 @@ def detailed( tp_size=tp_base, pp_size=pp_multiplier * pp_base, dcp_size=int(dcp_multiplier * tp_base), - dcp_kv_cache_interleave_size=dcp_kv_cache_interleave_size, + cp_kv_cache_interleave_size=cp_kv_cache_interleave_size, eager_mode=eager_mode_val, chunked_prefill=chunked_prefill_val, ) @@ -116,7 +116,7 @@ def _compare_cp_with_tp( tp_size, pp_size, dcp_size, - dcp_kv_cache_interleave_size, + cp_kv_cache_interleave_size, eager_mode, chunked_prefill, ) = parallel_setup @@ -197,7 +197,7 @@ def _compare_cp_with_tp( "--decode-context-parallel-size", str(dcp_size), "--dcp-kv-cache-interleave-size", - str(dcp_kv_cache_interleave_size), + str(cp_kv_cache_interleave_size), "--distributed-executor-backend", distributed_backend, ] @@ -227,7 +227,7 @@ def _compare_cp_with_tp( "deepseek-ai/DeepSeek-V2-Lite-Chat": [ CPTestSettings.detailed(), CPTestSettings.detailed(tp_base=2), - CPTestSettings.detailed(tp_base=2, dcp_kv_cache_interleave_size=64), + CPTestSettings.detailed(tp_base=2, cp_kv_cache_interleave_size=64), ], "bigcode/gpt_bigcode-santacoder": [ CPTestSettings.detailed(), diff --git a/tests/kernels/moe/modular_kernel_tools/common.py b/tests/kernels/moe/modular_kernel_tools/common.py index 1d925dc1bea8..d95c22fdf0a5 100644 --- a/tests/kernels/moe/modular_kernel_tools/common.py +++ b/tests/kernels/moe/modular_kernel_tools/common.py @@ -15,7 +15,11 @@ ) from tests.kernels.utils import torch_experts from vllm.config import VllmConfig -from vllm.distributed import get_dp_group, get_tensor_model_parallel_world_size +from vllm.distributed import ( + get_dp_group, + get_pcp_group, + get_tensor_model_parallel_world_size, +) from vllm.forward_context import set_forward_context from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, @@ -561,6 +565,7 @@ def next_power_of_2(x): # make moe config moe_parallel_config: FusedMoEParallelConfig = FusedMoEParallelConfig.make( tp_size_=get_tensor_model_parallel_world_size(), + pcp_size_=get_pcp_group().world_size, dp_size_=get_dp_group().world_size, vllm_parallel_config=vllm_config.parallel_config, ) diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index b95c8df3469b..824e45897835 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -956,7 +956,7 @@ def test_hybrid_block_table_initialization(): max_num_reqs = 10 max_num_blocks_per_req = 20 max_num_batched_tokens = 512 - dcp_kv_cache_interleave_size = 8 + cp_kv_cache_interleave_size = 8 block_table = BlockTable( block_size=block_size, @@ -966,7 +966,7 @@ def test_hybrid_block_table_initialization(): pin_memory=False, device=torch.device(DEVICE), kernel_block_size=kernel_block_sizes[0], - dcp_kv_cache_interleave_size=dcp_kv_cache_interleave_size, + cp_kv_cache_interleave_size=cp_kv_cache_interleave_size, ) # Verify hybrid block configuration diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 9275d70fd86a..d28bc065852d 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ 
-266,6 +266,12 @@ class AttentionImpl(ABC, Generic[T]): dcp_world_size: int dcp_rank: int + pcp_world_size: int + pcp_rank: int + + total_cp_world_size: int + total_cp_rank: int + def __new__(cls, *args, **kwargs): # use __new__ so that all subclasses will call this self = super().__new__(cls) @@ -278,6 +284,17 @@ def __new__(cls, *args, **kwargs): # DCP might not be initialized in testing self.dcp_world_size = 1 self.dcp_rank = 0 + try: + from vllm.distributed.parallel_state import get_pcp_group + + self.pcp_world_size = get_pcp_group().world_size + self.pcp_rank = get_pcp_group().rank_in_group + except AssertionError: + self.pcp_world_size = 1 + self.pcp_rank = 0 + self.total_cp_world_size = self.pcp_world_size * self.dcp_world_size + self.total_cp_rank = self.pcp_rank * self.dcp_world_size + self.dcp_rank + self.need_to_return_lse_for_decode = ( self.dcp_world_size > 1 and self.can_return_lse_for_decode ) diff --git a/vllm/attention/ops/common.py b/vllm/attention/ops/common.py index 2cbb5c91cc3b..67c5f7dbba9c 100644 --- a/vllm/attention/ops/common.py +++ b/vllm/attention/ops/common.py @@ -169,12 +169,11 @@ def correct_attn_out( return out, lse -def cp_lse_ag_out_rs( +def _cp_lse_common( cp_attn_out: torch.Tensor, cp_attn_lse: torch.Tensor, cp_group: GroupCoordinator, - ctx: CPTritonContext = None, - return_lse=False, + ctx: CPTritonContext | None = None, ): """ cp_attn_out: [ B, H, D ] @@ -195,6 +194,22 @@ def cp_lse_ag_out_rs( cp_attn_lse = cp_attn_lse.contiguous() lses = cp_group.all_gather(cp_attn_lse, dim=0).view_as(lses) out, lse = correct_attn_out(cp_attn_out, lses, cp_group.rank_in_group, ctx) + assert out.is_contiguous() + return out, lse + + +def cp_lse_ag_out_rs( + cp_attn_out: torch.Tensor, + cp_attn_lse: torch.Tensor, + cp_group: GroupCoordinator, + ctx: CPTritonContext | None = None, + return_lse: bool = False, +): + """ + cp_attn_out: [ B, H, D ] + cp_attn_lse: [ B, H ] + """ + out, lse = _cp_lse_common(cp_attn_out, cp_attn_lse, cp_group, ctx=ctx) out = cp_group.reduce_scatter(out, dim=1) if return_lse: @@ -205,6 +220,25 @@ def cp_lse_ag_out_rs( return out +def cp_lse_ag_out_ar( + cp_attn_out: torch.Tensor, + cp_attn_lse: torch.Tensor, + cp_group: GroupCoordinator, + ctx: CPTritonContext | None = None, + return_lse: bool = False, +): + """ + cp_attn_out: [ B, H, D ] + cp_attn_lse: [ B, H ] + """ + out, lse = _cp_lse_common(cp_attn_out, cp_attn_lse, cp_group, ctx=ctx) + out = cp_group.all_reduce(out) + + if return_lse: + return out, lse + return out + + @triton.jit def _pack_seq_kernel( x_ptr, # [N, D] diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 0f107a7a3ef8..4b0236d8de3f 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -71,6 +71,8 @@ class ParallelConfig: """Number of pipeline parallel groups.""" tensor_parallel_size: int = 1 """Number of tensor parallel groups.""" + prefill_context_parallel_size: int = 1 + """Number of prefill context parallel groups.""" data_parallel_size: int = 1 """Number of data parallel groups. MoE layers will be sharded according to the product of the tensor parallel size and data parallel size.""" @@ -239,14 +241,25 @@ class is dynamically inherited by the worker class. This is used to inject needs to be divisible by dcp_size.""" dcp_kv_cache_interleave_size: int = 1 - """Interleave size of kv_cache storage while using dcp or cp > 1, - store interleave_size tokens on (d)cp i, - then store next interleave_size tokens on (d)cp i+1. 
-    Interleave_size=1: token-level align, token i is stored on rank i % (d)cp_size.
-    Interleave_size=block_size: block-level align, first fill the block on first rank,
-    token is stored on rank i+1 block j after rank i block j is full.
-    Block_size should be greater than or equal to dcp_kv_cache_interleave_size.
-    Block_size should be divisible by dcp_kv_cache_interleave_size.
+    """
+    Interleave size of kv_cache storage while using DCP.
+    dcp_kv_cache_interleave_size has been replaced by cp_kv_cache_interleave_size,
+    and will be deprecated when PCP is fully supported.
+
+    """
+    cp_kv_cache_interleave_size: int = 1
+    """Interleave size of kv_cache storage while using DCP or PCP.
+    For `total_cp_rank = pcp_rank * dcp_world_size + dcp_rank`,
+    and `total_cp_world_size = pcp_world_size * dcp_world_size`.
+    Store interleave_size tokens on total_cp_rank i,
+    then store the next interleave_size tokens on total_cp_rank i+1.
+    Interleave_size=1: token-level alignment, where token `i` is stored on
+    total_cp_rank `i % total_cp_world_size`.
+    Interleave_size=block_size: block-level alignment, where tokens are
+    first populated to the preceding ranks. Tokens are then stored
+    in (rank i+1, block j) only after (rank i, block j) is fully occupied.
+    Block_size should be greater than or equal to cp_kv_cache_interleave_size.
+    Block_size should be divisible by cp_kv_cache_interleave_size.
     """

     _api_process_count: int = Field(default=1, gt=0)
@@ -311,6 +324,11 @@ def _validate_parallel_config(self) -> Self:
                 "num_redundant_experts."
             )

+        if self.prefill_context_parallel_size > 1:
+            raise ValueError(
+                "Prefill context parallelism is not fully supported. "
+                "Please set prefill_context_parallel_size to 1."
+            )
         return self

     @property
@@ -529,7 +547,11 @@ def __post_init__(self) -> None:
             )

         # Continue with the rest of the initialization
-        self.world_size = self.pipeline_parallel_size * self.tensor_parallel_size
+        self.world_size = (
+            self.pipeline_parallel_size
+            * self.tensor_parallel_size
+            * self.prefill_context_parallel_size
+        )

         if self.distributed_executor_backend == "external_launcher":
             logger.info("Using external launcher for distributed inference.")
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index 672b004c4aa5..d64e315b4fe3 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -481,6 +481,14 @@ def __post_init__(self):
                     "Overriding cudagraph_mode to PIECEWISE."
                 )
                 self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
+            # prefill context parallel does not support full cudagraphs
+            elif self.parallel_config.prefill_context_parallel_size > 1:
+                logger.warning_once(
+                    "Prefill context parallel (PCP) is enabled, which is "
+                    "incompatible with full CUDA graphs. "
+                    "Overriding cudagraph_mode to PIECEWISE."
+                )
+                self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
             elif self.model_config is not None:
                 if self.model_config.pooler_config is not None:
                     logger.warning_once(
@@ -610,22 +618,34 @@ def __post_init__(self):

         # If DCP, ensure the block size is right.
         if self.parallel_config.decode_context_parallel_size > 1:
+            if self.parallel_config.dcp_kv_cache_interleave_size > 1 and (
+                self.parallel_config.cp_kv_cache_interleave_size
+                != self.parallel_config.dcp_kv_cache_interleave_size
+            ):
+                self.parallel_config.cp_kv_cache_interleave_size = (
+                    self.parallel_config.dcp_kv_cache_interleave_size
+                )
+                logger.warning_once(
+                    "cp_kv_cache_interleave_size is overridden by dcp_kv_cache"
And dcp-kv-cache-interleave-size will be " + "deprecated when PCP is fully supported." + ) assert ( - self.parallel_config.dcp_kv_cache_interleave_size + self.parallel_config.cp_kv_cache_interleave_size <= self.cache_config.block_size and self.cache_config.block_size - % self.parallel_config.dcp_kv_cache_interleave_size + % self.parallel_config.cp_kv_cache_interleave_size == 0 ), ( f"Block_size({self.cache_config.block_size}) should be greater " - "than or equal to and divisible by dcp_kv_cache_interleave_size " - f"({self.parallel_config.dcp_kv_cache_interleave_size})." + "than or equal to and divisible by cp_kv_cache_interleave_size " + f"({self.parallel_config.cp_kv_cache_interleave_size})." ) assert ( - self.parallel_config.dcp_kv_cache_interleave_size == 1 + self.parallel_config.cp_kv_cache_interleave_size == 1 or self.speculative_config is None - ), "MTP with dcp_kv_cache_interleave_size > 1 is not supported now." + ), "MTP with cp_kv_cache_interleave_size > 1 is not supported now." # Do this after all the updates to compilation_config.mode if self.compilation_config.mode == CompilationMode.VLLM_COMPILE: diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 852c4c644433..f81612fd1f4a 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -1098,6 +1098,12 @@ def get_dcp_group() -> GroupCoordinator: _PP: GroupCoordinator | None = None + +def get_pp_group() -> GroupCoordinator: + assert _PP is not None, "pipeline model parallel group is not initialized" + return _PP + + _DP: GroupCoordinator | None = None @@ -1114,9 +1120,12 @@ def get_ep_group() -> GroupCoordinator: return _EP -def get_pp_group() -> GroupCoordinator: - assert _PP is not None, "pipeline model parallel group is not initialized" - return _PP +_PCP: GroupCoordinator | None = None + + +def get_pcp_group() -> GroupCoordinator: + assert _PCP is not None, "prefill context parallel group is not initialized" + return _PCP @deprecated( @@ -1276,6 +1285,7 @@ def init_distributed_environment( def initialize_model_parallel( tensor_model_parallel_size: int = 1, pipeline_model_parallel_size: int = 1, + prefill_context_model_parallel_size: int = 1, decode_context_model_parallel_size: int | None = 1, backend: str | None = None, ) -> None: @@ -1325,7 +1335,11 @@ def initialize_model_parallel( # to get group_ranks for each dimension, transpose that dimension to the # last dimension, then reshape to 2D, then unbind the last dimension all_ranks = torch.arange(world_size).reshape( - -1, data_parallel_size, pipeline_model_parallel_size, tensor_model_parallel_size + -1, + data_parallel_size, + pipeline_model_parallel_size, + prefill_context_model_parallel_size, + tensor_model_parallel_size, ) # noqa # Build the tensor model-parallel groups. @@ -1360,11 +1374,23 @@ def initialize_model_parallel( group_name="dcp", ) + global _PCP + assert _PCP is None, "prefill context parallel group is already initialized" + group_ranks = ( + all_ranks.transpose(3, 4) + .reshape(-1, prefill_context_model_parallel_size) + .unbind(0) + ) + group_ranks = [x.tolist() for x in group_ranks] + _PCP = init_model_parallel_group( + group_ranks, get_world_group().local_rank, backend, group_name="pcp" + ) + # Build the pipeline model-parallel groups. 
global _PP assert _PP is None, "pipeline model parallel group is already initialized" group_ranks = ( - all_ranks.transpose(2, 3).reshape(-1, pipeline_model_parallel_size).unbind(0) + all_ranks.transpose(2, 4).reshape(-1, pipeline_model_parallel_size).unbind(0) ) group_ranks = [x.tolist() for x in group_ranks] _PP = init_model_parallel_group( @@ -1373,7 +1399,7 @@ def initialize_model_parallel( global _DP assert _DP is None, "data parallel group is already initialized" - group_ranks = all_ranks.transpose(1, 3).reshape(-1, data_parallel_size).unbind(0) + group_ranks = all_ranks.transpose(1, 4).reshape(-1, data_parallel_size).unbind(0) group_ranks = [x.tolist() for x in group_ranks] _DP = init_model_parallel_group( group_ranks, get_world_group().local_rank, backend, group_name="dp" @@ -1383,7 +1409,12 @@ def initialize_model_parallel( assert _EP is None, "expert parallel group is already initialized" group_ranks = ( all_ranks.transpose(1, 2) - .reshape(-1, data_parallel_size * tensor_model_parallel_size) + .reshape( + -1, + data_parallel_size + * prefill_context_model_parallel_size + * tensor_model_parallel_size, + ) .unbind(0) ) group_ranks = [x.tolist() for x in group_ranks] @@ -1393,11 +1424,13 @@ def initialize_model_parallel( logger.info_once( "rank %s in world size %s is assigned as " - "DP rank %s, PP rank %s, TP rank %s, EP rank %s", + "DP rank %s, PP rank %s, PCP rank %s, " + "TP rank %s, EP rank %s", rank, world_size, _DP.rank_in_group, _PP.rank_in_group, + _PCP.rank_in_group, _TP.rank_in_group, _EP.rank_in_group, ) @@ -1406,6 +1439,7 @@ def initialize_model_parallel( def ensure_model_parallel_initialized( tensor_model_parallel_size: int, pipeline_model_parallel_size: int, + prefill_context_model_parallel_size: int = 1, decode_context_model_parallel_size: int | None = 1, backend: str | None = None, ) -> None: @@ -1418,6 +1452,7 @@ def ensure_model_parallel_initialized( initialize_model_parallel( tensor_model_parallel_size, pipeline_model_parallel_size, + prefill_context_model_parallel_size, decode_context_model_parallel_size, backend, ) @@ -1434,6 +1469,12 @@ def ensure_model_parallel_initialized( f"got: {pp_world_size=} vs. " f"wanted: {pipeline_model_parallel_size=}" ) + pcp_world_size = get_pcp_group().world_size + assert pcp_world_size == prefill_context_model_parallel_size, ( + "prefill context parallel group already initialized, but of unexpected size: " + f"{pcp_world_size=} vs. 
" + f"{prefill_context_model_parallel_size=}" + ) def prepare_communication_buffer_for_model(model: torch.nn.Module): @@ -1445,6 +1486,8 @@ def prepare_communication_buffer_for_model(model: torch.nn.Module): """ if _TP is not None: _TP.prepare_communication_buffer_for_model(model) + if _PCP is not None: + _PCP.prepare_communication_buffer_for_model(model) if _PP is not None: _PP.prepare_communication_buffer_for_model(model) if _DP is not None: @@ -1520,16 +1563,21 @@ def destroy_model_parallel(): _TP.destroy() _TP = None - global _PP - if _PP: - _PP.destroy() - _PP = None - global _DCP if _DCP: _DCP.destroy() _DCP = None + global _PCP + if _PCP: + _PCP.destroy() + _PCP = None + + global _PP + if _PP: + _PP.destroy() + _PP = None + global _DP if _DP: _DP.destroy() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index e2f7326448b3..68205b6079d7 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -389,8 +389,10 @@ class EngineArgs: nnodes: int = ParallelConfig.nnodes node_rank: int = ParallelConfig.node_rank tensor_parallel_size: int = ParallelConfig.tensor_parallel_size + prefill_context_parallel_size: int = ParallelConfig.prefill_context_parallel_size decode_context_parallel_size: int = ParallelConfig.decode_context_parallel_size dcp_kv_cache_interleave_size: int = ParallelConfig.dcp_kv_cache_interleave_size + cp_kv_cache_interleave_size: int = ParallelConfig.cp_kv_cache_interleave_size data_parallel_size: int = ParallelConfig.data_parallel_size data_parallel_rank: int | None = None data_parallel_start_rank: int | None = None @@ -770,6 +772,15 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "--dcp-kv-cache-interleave-size", **parallel_kwargs["dcp_kv_cache_interleave_size"], ) + parallel_group.add_argument( + "--cp-kv-cache-interleave-size", + **parallel_kwargs["cp_kv_cache_interleave_size"], + ) + parallel_group.add_argument( + "--prefill-context-parallel-size", + "-pcp", + **parallel_kwargs["prefill_context_parallel_size"], + ) parallel_group.add_argument( "--data-parallel-size", "-dp", **parallel_kwargs["data_parallel_size"] ) @@ -1600,6 +1611,7 @@ def create_engine_config( parallel_config = ParallelConfig( pipeline_parallel_size=self.pipeline_parallel_size, tensor_parallel_size=self.tensor_parallel_size, + prefill_context_parallel_size=self.prefill_context_parallel_size, data_parallel_size=self.data_parallel_size, data_parallel_rank=self.data_parallel_rank or 0, data_parallel_external_lb=data_parallel_external_lb, @@ -1631,6 +1643,7 @@ def create_engine_config( worker_extension_cls=self.worker_extension_cls, decode_context_parallel_size=self.decode_context_parallel_size, dcp_kv_cache_interleave_size=self.dcp_kv_cache_interleave_size, + cp_kv_cache_interleave_size=self.cp_kv_cache_interleave_size, _api_process_count=self._api_process_count, _api_process_rank=self._api_process_rank, ) @@ -1952,6 +1965,15 @@ def _set_default_args( default_prefix_caching, ) = self.get_chunked_prefill_prefix_caching_defaults(model_config) + if self.prefill_context_parallel_size > 1: + default_chunked_prefill = False + default_prefix_caching = False + logger.warning( + "--prefill-context-parallel-size > 1 is not compatible with " + "chunked prefill and prefix caching now. Chunked prefill " + "and prefix caching have been disabled by default." 
+ ) + if self.enable_chunked_prefill is None: self.enable_chunked_prefill = default_chunked_prefill diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index a7bd64b1c65e..21eb4d590a7d 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -8,7 +8,11 @@ import vllm.envs as envs from vllm.config import ParallelConfig -from vllm.distributed import get_dp_group, get_tensor_model_parallel_rank +from vllm.distributed import ( + get_dp_group, + get_pcp_group, + get_tensor_model_parallel_rank, +) from vllm.logger import init_logger from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import ( OCP_MX_DTYPES, @@ -684,9 +688,11 @@ def biased_moe_quant_config( @dataclass class FusedMoEParallelConfig: tp_size: int + pcp_size: int dp_size: int ep_size: int tp_rank: int + pcp_rank: int dp_rank: int ep_rank: int @@ -713,19 +719,22 @@ def use_deepep_ll_kernels(self): return self.use_all2all_kernels and self.all2all_backend == "deepep_low_latency" @staticmethod - def flatten_tp_across_dp( - tp_size: int, dp_size: int, dp_rank: int + def flatten_tp_across_dp_and_pcp( + tp_size: int, dp_size: int, dp_rank: int, pcp_size: int, pcp_rank: int ) -> tuple[int, int]: tp_rank = 0 if tp_size == 1 else get_tensor_model_parallel_rank() - # There are actually dp_size * tp_size devices. Update tp_size - # and tp_rank so we shard across all devices. - flatten_tp_size = dp_size * tp_size - flatten_tp_rank = dp_rank * tp_size + tp_rank + # There are actually dp_size * pcp_size * tp_size devices. + # Update tp_size and tp_rank so we shard across all devices. + flatten_tp_size = dp_size * pcp_size * tp_size + flatten_tp_rank = dp_rank * pcp_size * tp_size + pcp_rank * tp_size + tp_rank return flatten_tp_size, flatten_tp_rank @staticmethod def make( - tp_size_: int, dp_size_: int, vllm_parallel_config: ParallelConfig + tp_size_: int, + pcp_size_: int, + dp_size_: int, + vllm_parallel_config: ParallelConfig, ) -> "FusedMoEParallelConfig": """ Determine MoE parallel configuration. Based on the input `tp_size_`, @@ -734,19 +743,22 @@ def make( Args: tp_size_ (int): `tp_size` passed into the FusedMoE constructor. + pcp_size_ (int): `pcp_size` passed into the FusedMoE constructor. dp_size_ (int): `dp_size` passed into the FusedMoE constructor. vllm_parallel_config (ParallelConfig): vLLM's parallel config object which contains the `enable_expert_parallel` flag. Examples: When there is no parallelism requested, - i.e. `tp_size_` = `dp_size_` = 1, we simply return the sizes + i.e. `tp_size_` = `pcp_size_` = `dp_size_` = 1, we simply return the sizes unaltered and the ranks set to 0. - Expert Parallelism is considered only when either `dp_size_` or + Expert Parallelism is considered only when either `dp_size_`, `pcp_size_` or `tp_size_` is non trivial. - When TP = 2, DP = 1 and EP = False, the configuration on different + Note that PCP serves the same function as DP here. + + When TP = 2, DP(PCP) = 1 and EP = False, the configuration on different devices: - device 0 : TP = {2, 0} DP = {1, 0} EP = {1, 0} // @@ -754,7 +766,7 @@ def make( - device 1 : TP = {2, 1} DP = {1, 0} EP = {1, 0} - Comment : Tensors are sharded across 2 devices. 
- When TP = 1, DP = 2 and EP = False, the configuration on different + When TP = 1, DP(PCP) = 2 and EP = False, the configuration on different devices: - device 0 : TP = {2, 0} DP = {2, 0} EP = {1, 0} @@ -762,7 +774,7 @@ def make( - Comment: There are 2 engine instances and the tensors are sharded across 2 decvices. - When TP = 2, DP = 2 and EP = False, the configuration on different + When TP = 2, DP(PCP) = 2 and EP = False, the configuration on different devices: - device 0: TP = {4, 0} DP = {2, 0} EP = {1, 0} @@ -772,14 +784,14 @@ def make( - Comment: There are 2 engine instances and the tensors are sharded across 4 devices. - When, TP = 2, DP = 1 and EP = True, the configuration on different + When, TP = 2, DP(PCP) = 1 and EP = True, the configuration on different devices: - device 0: TP = {1, 0} DP = {1, 0} EP = {2, 0} - device 1: TP = {1, 0} DP = {1, 0} EP = {2, 1} - Comment: The experts are split between the 2 devices. - When, TP = 1, DP = 2 and EP = True, the configuration on different + When, TP = 1, DP(PCP) = 2 and EP = True, the configuration on different devices: - device 0: TP = {1, 0} DP = {2, 0} EP = {2, 0} @@ -787,7 +799,7 @@ def make( - Comment: There are 2 engine instances and the experts are split between the 2 devices. - When TP = 2, DP = 2 and EP = True, the configuration on different + When TP = 2, DP(PCP) = 2 and EP = True, the configuration on different devices: - device 0: TP = {1, 0} DP = {2, 0} EP = {4, 0} @@ -798,18 +810,25 @@ def make( between the 4 devices. """ - use_ep = dp_size_ * tp_size_ > 1 and vllm_parallel_config.enable_expert_parallel + use_ep = ( + dp_size_ * pcp_size_ * tp_size_ > 1 + and vllm_parallel_config.enable_expert_parallel + ) dp_size = dp_size_ dp_rank = get_dp_group().rank_in_group if dp_size > 1 else 0 - tp_size, tp_rank = FusedMoEParallelConfig.flatten_tp_across_dp( - tp_size_, dp_size_, dp_rank + pcp_size = pcp_size_ + pcp_rank = get_pcp_group().rank_in_group if pcp_size > 1 else 0 + tp_size, tp_rank = FusedMoEParallelConfig.flatten_tp_across_dp_and_pcp( + tp_size_, dp_size_, dp_rank, pcp_size_, pcp_rank ) if not use_ep: return FusedMoEParallelConfig( tp_size=tp_size, tp_rank=tp_rank, + pcp_size=pcp_size, + pcp_rank=pcp_rank, dp_size=dp_size, dp_rank=dp_rank, ep_size=1, @@ -826,6 +845,8 @@ def make( return FusedMoEParallelConfig( tp_size=1, tp_rank=0, + pcp_size=pcp_size, + pcp_rank=pcp_rank, dp_size=dp_size, dp_rank=dp_rank, ep_size=ep_size, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 8e9bba344287..7b15e63e9e35 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -18,6 +18,7 @@ from vllm.distributed import ( get_dp_group, get_ep_group, + get_pcp_group, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce, ) @@ -343,6 +344,7 @@ def __init__( tp_size: int | None = None, ep_size: int | None = None, dp_size: int | None = None, + pcp_size: int | None = None, prefix: str = "", custom_routing_function: Callable | None = None, scoring_func: str = "softmax", @@ -398,12 +400,14 @@ def __init__( tp_size if tp_size is not None else get_tensor_model_parallel_world_size() ) dp_size_ = dp_size if dp_size is not None else get_dp_group().world_size + pcp_size_ = pcp_size if pcp_size is not None else get_pcp_group().world_size self.is_sequence_parallel = is_sequence_parallel self.sp_size = tp_size_ if is_sequence_parallel else 1 self.moe_parallel_config: FusedMoEParallelConfig = FusedMoEParallelConfig.make( 
tp_size_=tp_size_, + pcp_size_=pcp_size_, dp_size_=dp_size_, vllm_parallel_config=vllm_config.parallel_config, ) @@ -679,6 +683,10 @@ def tp_size(self): def dp_size(self): return self.moe_parallel_config.dp_size + @property + def pcp_size(self): + return self.moe_parallel_config.pcp_size + @property def ep_size(self): return self.moe_parallel_config.ep_size @@ -691,6 +699,10 @@ def tp_rank(self): def dp_rank(self): return self.moe_parallel_config.dp_rank + @property + def pcp_rank(self): + return self.moe_parallel_config.pcp_rank + @property def ep_rank(self): return self.moe_parallel_config.ep_rank @@ -1871,6 +1883,19 @@ def forward_impl( assert self.shared_experts is not None shared_output = self.shared_experts(hidden_states) + # NOTE: Similar with DP, PCP also needs dispatch and combine. For + # simplicity, AgRsAll2All was added separately for PCP here. Maybe + # we should modify All2AllManager abstract to better support PCP. + if self.pcp_size > 1: + hidden_states = get_pcp_group().all_gather( + hidden_states, + dim=0, + ) + router_logits = get_pcp_group().all_gather( + router_logits, + dim=0, + ) + # Matrix multiply. final_hidden_states = self.quant_method.apply( layer=self, @@ -1925,6 +1950,13 @@ def forward_impl( def combine_output(states: torch.Tensor) -> torch.Tensor: if do_naive_dispatch_combine: states = get_ep_group().combine(states, self.is_sequence_parallel) + + if self.pcp_size > 1: + states = get_pcp_group().reduce_scatter( + states, + dim=0, + ) + return states if self.shared_experts is not None: diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index f310f71af92d..25048330f797 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -13,6 +13,7 @@ from vllm.distributed import ( get_dp_group, get_ep_group, + get_pcp_group, get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, @@ -322,10 +323,12 @@ def _load_weights_mxfp4( # In MoE, we need to flatten the tensor parallel size across the data # parallel size when EP is disabled. - tp_size, tp_rank = FusedMoEParallelConfig.flatten_tp_across_dp( + tp_size, tp_rank = FusedMoEParallelConfig.flatten_tp_across_dp_and_pcp( tp_size=get_tensor_model_parallel_world_size(), dp_size=get_dp_group().world_size, dp_rank=get_dp_group().rank_in_group, + pcp_size=get_pcp_group().world_size, + pcp_rank=get_pcp_group().rank_in_group, ) intermediate_size = self.config.intermediate_size @@ -507,10 +510,12 @@ def _load_weights_other( # In MoE, we need to flatten the tensor parallel size across the data # parallel size when EP is disabled. 
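# Minimal sketch (plain torch, no process groups; shapes are assumptions) of
# the PCP dispatch/combine added to forward_impl/combine_output above: tokens
# are all-gathered along dim 0 before the experts run, and the partial outputs
# are reduce-scattered afterwards, leaving each PCP rank with the summed
# result for only its own token shard.
import torch

pcp_size, tokens_per_rank, hidden = 2, 3, 4
shards = [torch.randn(tokens_per_rank, hidden) for _ in range(pcp_size)]

# "all_gather(dim=0)": every rank sees the full token set.
gathered = torch.cat(shards, dim=0)

# Each rank contributes a partial output (e.g. from its shard of experts).
partials = [gathered * 0.5, gathered * 0.5]

# "reduce_scatter(dim=0)": sum the partials, hand each rank its own chunk.
reduced = torch.stack(partials).sum(dim=0)
per_rank_out = list(reduced.chunk(pcp_size, dim=0))
assert torch.allclose(torch.cat(per_rank_out, dim=0), gathered)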
- tp_size, tp_rank = FusedMoEParallelConfig.flatten_tp_across_dp( + tp_size, tp_rank = FusedMoEParallelConfig.flatten_tp_across_dp_and_pcp( tp_size=get_tensor_model_parallel_world_size(), dp_size=get_dp_group().world_size, dp_rank=get_dp_group().rank_in_group, + pcp_size=get_pcp_group().world_size, + pcp_rank=get_pcp_group().rank_in_group, ) intermediate_size = self.config.intermediate_size diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index fdc99a0df1c8..cf3c1d05f5b3 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -265,8 +265,8 @@ def __init__( self.dcp_world_size = 1 self.dcp_rank = 0 - self.dcp_kv_cache_interleave_size = ( - self.parallel_config.dcp_kv_cache_interleave_size + self.cp_kv_cache_interleave_size = ( + self.parallel_config.cp_kv_cache_interleave_size ) self.use_full_cuda_graph = ( @@ -388,7 +388,7 @@ def schedule( dcp_context_kv_lens_cpu, self.dcp_world_size, self.dcp_rank, - self.dcp_kv_cache_interleave_size, + self.cp_kv_cache_interleave_size, ) dcp_context_kv_lens = dcp_context_kv_lens_cpu.to(self.device) max_dcp_context_kv_len = dcp_context_kv_lens.max().item() diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index e328049b53c7..32f406980f2e 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -536,7 +536,7 @@ def __init__( # DCP might not be initialized in testing self.dcp_world_size = 1 self.dcp_rank = 0 - self.dcp_local_block_size = parallel_config.dcp_kv_cache_interleave_size + self.dcp_local_block_size = parallel_config.cp_kv_cache_interleave_size self.dcp_virtual_block_size = self.dcp_local_block_size * self.dcp_world_size # Don't try to access the runner on AMD @@ -1289,8 +1289,8 @@ def __init__(self, *args, **kwargs) -> None: get_current_vllm_config() ) ) - self.dcp_kv_cache_interleave_size: int = ( - get_current_vllm_config().parallel_config.dcp_kv_cache_interleave_size + self.cp_kv_cache_interleave_size: int = ( + get_current_vllm_config().parallel_config.cp_kv_cache_interleave_size ) def _flash_attn_varlen_diff_headdims( diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 0dd189633129..540a8e2b1d01 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -1080,9 +1080,9 @@ def compute_causal_conv1d_metadata(query_start_loc_p: torch.Tensor): def get_dcp_local_seq_lens( seq_lens: torch.Tensor, - dcp_world_size: int = 1, + dcp_size: int = 1, dcp_rank: int | None = None, - dcp_kv_cache_interleave_size: int = 1, + cp_kv_cache_interleave_size: int = 1, ) -> torch.Tensor: """While using dcp, kv_cache size stored on each rank may be different, use this function to calculate split decode seq_lens of each dcp rank. 
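# Worked example (plain Python, values assumed) of the interleaved split that
# get_dcp_local_seq_lens computes with the base/remainder arithmetic in the
# hunk below. With cp_kv_cache_interleave_size=4 and dcp_size=2, tokens are
# dealt out in chunks of 4, chunk j going to rank j % 2, so a 37-token
# sequence keeps 20 tokens on rank 0 and 17 on rank 1.
seq_len, interleave, dcp_size = 37, 4, 2

def local_kv_len(rank: int) -> int:
    base = seq_len // interleave // dcp_size * interleave
    remainder = seq_len - base * dcp_size
    remainder = min(max(remainder - rank * interleave, 0), interleave)
    return base + remainder

assert [local_kv_len(r) for r in range(dcp_size)] == [20, 17]
assert sum(local_kv_len(r) for r in range(dcp_size)) == seq_len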
@@ -1091,7 +1091,7 @@ def get_dcp_local_seq_lens( num_requests = seq_lens.size(0) if dcp_rank is None: rank_offsets = ( - torch.arange(dcp_world_size, dtype=torch.int32) + torch.arange(dcp_size, dtype=torch.int32) .unsqueeze(0) .repeat(num_requests, 1) ) @@ -1102,15 +1102,15 @@ def get_dcp_local_seq_lens( ) base = ( seq_lens_tiled - // dcp_kv_cache_interleave_size - // dcp_world_size - * dcp_kv_cache_interleave_size + // cp_kv_cache_interleave_size + // dcp_size + * cp_kv_cache_interleave_size ) - remainder = seq_lens_tiled - base * dcp_world_size + remainder = seq_lens_tiled - base * dcp_size remainder = torch.clip( - remainder - rank_offsets * dcp_kv_cache_interleave_size, + remainder - rank_offsets * cp_kv_cache_interleave_size, 0, - dcp_kv_cache_interleave_size, + cp_kv_cache_interleave_size, ) dcp_local_seq_lens = base + remainder return dcp_local_seq_lens.squeeze(1) diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py index 137e5e0cdb6d..1531b61f88fe 100644 --- a/vllm/v1/core/kv_cache_coordinator.py +++ b/vllm/v1/core/kv_cache_coordinator.py @@ -27,6 +27,7 @@ def __init__( enable_caching: bool, enable_kv_cache_events: bool, dcp_world_size: int, + pcp_world_size: int, ): self.kv_cache_config = kv_cache_config self.max_model_len = max_model_len @@ -44,6 +45,7 @@ def __init__( block_pool=self.block_pool, kv_cache_group_id=i, dcp_world_size=dcp_world_size, + pcp_world_size=pcp_world_size, ) for i, kv_cache_group in enumerate(self.kv_cache_config.kv_cache_groups) ) @@ -210,6 +212,7 @@ def __init__( use_eagle: bool, enable_kv_cache_events: bool, dcp_world_size: int, + pcp_world_size: int, ): super().__init__( kv_cache_config, @@ -218,6 +221,7 @@ def __init__( False, enable_kv_cache_events, dcp_world_size=dcp_world_size, + pcp_world_size=pcp_world_size, ) self.num_single_type_manager = len(self.single_type_managers) @@ -250,6 +254,7 @@ def __init__( enable_caching: bool, enable_kv_cache_events: bool, dcp_world_size: int, + pcp_world_size: int, ): super().__init__( kv_cache_config, @@ -258,12 +263,16 @@ def __init__( enable_caching, enable_kv_cache_events, dcp_world_size=dcp_world_size, + pcp_world_size=pcp_world_size, ) self.kv_cache_spec = self.kv_cache_config.kv_cache_groups[0].kv_cache_spec self.block_size = self.kv_cache_spec.block_size self.dcp_world_size = dcp_world_size + self.pcp_world_size = pcp_world_size if dcp_world_size > 1: self.block_size *= dcp_world_size + if pcp_world_size > 1: + self.block_size *= pcp_world_size assert len(self.kv_cache_config.kv_cache_groups) == 1, ( "UnitaryKVCacheCoordinator assumes only one kv cache group" ) @@ -281,6 +290,7 @@ def find_longest_cache_hit( kv_cache_spec=self.kv_cache_spec, use_eagle=self.use_eagle, dcp_world_size=self.dcp_world_size, + pcp_world_size=self.pcp_world_size, ) return hit_blocks, len(hit_blocks[0]) * self.block_size @@ -302,6 +312,7 @@ def __init__( enable_caching: bool, enable_kv_cache_events: bool, dcp_world_size: int, + pcp_world_size: int, ): super().__init__( kv_cache_config, @@ -310,8 +321,10 @@ def __init__( enable_caching, enable_kv_cache_events, dcp_world_size=dcp_world_size, + pcp_world_size=pcp_world_size, ) assert dcp_world_size == 1, "DCP not support hybrid attn now." + assert pcp_world_size == 1, "PCP not support hybrid attn now." 
self.verify_and_split_kv_cache_groups() def verify_and_split_kv_cache_groups(self) -> None: @@ -452,6 +465,7 @@ def get_kv_cache_coordinator( enable_caching: bool, enable_kv_cache_events: bool, dcp_world_size: int, + pcp_world_size: int, ) -> KVCacheCoordinator: if not enable_caching: return KVCacheCoordinatorNoPrefixCache( @@ -460,6 +474,7 @@ def get_kv_cache_coordinator( use_eagle, enable_kv_cache_events, dcp_world_size=dcp_world_size, + pcp_world_size=pcp_world_size, ) if len(kv_cache_config.kv_cache_groups) == 1: return UnitaryKVCacheCoordinator( @@ -469,6 +484,7 @@ def get_kv_cache_coordinator( enable_caching, enable_kv_cache_events, dcp_world_size=dcp_world_size, + pcp_world_size=pcp_world_size, ) return HybridKVCacheCoordinator( kv_cache_config, @@ -477,4 +493,5 @@ def get_kv_cache_coordinator( enable_caching, enable_kv_cache_events, dcp_world_size=dcp_world_size, + pcp_world_size=pcp_world_size, ) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 7f405fc248ac..2012c3fef88b 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -100,6 +100,7 @@ def __init__( log_stats: bool = False, enable_kv_cache_events: bool = False, dcp_world_size: int = 1, + pcp_world_size: int = 1, ) -> None: self.max_model_len = max_model_len @@ -124,12 +125,9 @@ def __init__( 0 ].kv_cache_spec.block_size - if dcp_world_size > 1: + if dcp_world_size * pcp_world_size > 1: assert len(kv_cache_config.kv_cache_groups) == 1 - # Note(hc): need revisit. When both DCP and any future - # PCP are enabled, the block_size may need to be scaled - # by a factor of dcp_size × pcp_size? - self.block_size *= dcp_world_size + self.block_size *= dcp_world_size * pcp_world_size self.coordinator = get_kv_cache_coordinator( kv_cache_config=kv_cache_config, @@ -138,6 +136,7 @@ def __init__( enable_caching=self.enable_caching, enable_kv_cache_events=enable_kv_cache_events, dcp_world_size=dcp_world_size, + pcp_world_size=pcp_world_size, ) self.num_kv_cache_groups = len(kv_cache_config.kv_cache_groups) self.block_pool = self.coordinator.block_pool diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 6e026215d402..01ecd881115d 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -1219,11 +1219,16 @@ def _report_kv_cache_config( // len(kv_cache_config.kv_cache_groups) * min_block_size ) - if vllm_config.parallel_config.decode_context_parallel_size > 1: - num_tokens *= vllm_config.parallel_config.decode_context_parallel_size + dcp_size = vllm_config.parallel_config.decode_context_parallel_size + pcp_size = vllm_config.parallel_config.prefill_context_parallel_size + if pcp_size * dcp_size > 1: + num_tokens *= pcp_size * dcp_size logger.info( - "Multiplying the GPU KV cache size by the dcp_world_size %d.", - vllm_config.parallel_config.decode_context_parallel_size, + "Multiplying the GPU KV cache size by the cp_world_size %d " + "(pcp_world_size %d * dcp_world_size %d).", + pcp_size * dcp_size, + pcp_size, + dcp_size, ) num_tokens_str = f"{num_tokens:,}" logger.info_once("GPU KV cache size: %s tokens", num_tokens_str, scope="local") diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 4323141c435b..4cc4c29591cc 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -121,6 +121,7 @@ def __init__( self.block_size = block_size self.dcp_world_size = vllm_config.parallel_config.decode_context_parallel_size + self.pcp_world_size = 
vllm_config.parallel_config.prefill_context_parallel_size # req_id -> Request self.requests: dict[str, Request] = {} @@ -183,6 +184,7 @@ def __init__( log_stats=self.log_stats, enable_kv_cache_events=self.enable_kv_cache_events, dcp_world_size=self.dcp_world_size, + pcp_world_size=self.pcp_world_size, ) self.use_pp = self.parallel_config.pipeline_parallel_size > 1 diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py index 14ac83028ee4..d90ec550f766 100644 --- a/vllm/v1/core/single_type_kv_cache_manager.py +++ b/vllm/v1/core/single_type_kv_cache_manager.py @@ -32,6 +32,7 @@ def __init__( block_pool: BlockPool, kv_cache_group_id: int, dcp_world_size: int = 1, + pcp_world_size: int = 1, ) -> None: """ Initializes the SingleTypeKVCacheManager. @@ -42,8 +43,9 @@ def __init__( """ self.block_size = kv_cache_spec.block_size self.dcp_world_size = dcp_world_size - if self.dcp_world_size > 1: - self.block_size *= dcp_world_size + self.pcp_world_size = pcp_world_size + if dcp_world_size * pcp_world_size > 1: + self.block_size *= dcp_world_size * pcp_world_size self.kv_cache_spec = kv_cache_spec self.block_pool = block_pool @@ -212,6 +214,7 @@ def find_longest_cache_hit( kv_cache_spec: KVCacheSpec, use_eagle: bool, dcp_world_size: int = 1, + pcp_world_size: int = 1, ) -> tuple[list[KVCacheBlock], ...]: """ Get the longest cache hit prefix of the blocks that is not longer than @@ -303,6 +306,7 @@ def find_longest_cache_hit( kv_cache_spec: KVCacheSpec, use_eagle: bool, dcp_world_size: int = 1, + pcp_world_size: int = 1, ) -> tuple[list[KVCacheBlock], ...]: assert isinstance( kv_cache_spec, (FullAttentionSpec, ChunkedLocalAttentionSpec) @@ -314,8 +318,8 @@ def find_longest_cache_hit( [] for _ in range(len(kv_cache_group_ids)) ) block_size = kv_cache_spec.block_size - if dcp_world_size > 1: - block_size *= dcp_world_size + if dcp_world_size * pcp_world_size > 1: + block_size *= dcp_world_size * pcp_world_size max_num_blocks = max_length // block_size for block_hash in itertools.islice(block_hashes, max_num_blocks): # block_hashes is a chain of block hashes. If a block hash is not @@ -362,11 +366,13 @@ def find_longest_cache_hit( kv_cache_spec: KVCacheSpec, use_eagle: bool, dcp_world_size: int = 1, + pcp_world_size: int = 1, ) -> tuple[list[KVCacheBlock], ...]: assert isinstance(kv_cache_spec, SlidingWindowSpec), ( "SlidingWindowManager can only be used for sliding window groups" ) assert dcp_world_size == 1, "DCP not support sliding window attn now." + assert pcp_world_size == 1, "PCP not support sliding window attn now." # The number of contiguous blocks needed for prefix cache hit. # -1 since the input token itself is also included in the window @@ -476,6 +482,7 @@ def find_longest_cache_hit( kv_cache_spec: KVCacheSpec, use_eagle: bool, dcp_world_size: int = 1, + pcp_world_size: int = 1, ) -> tuple[list[KVCacheBlock], ...]: """ For chunked local attention, we need to find the longest cache hit @@ -516,6 +523,7 @@ def find_longest_cache_hit( "Hybrid KV cache is not supported for " + "eagle + chunked local attention." ) assert dcp_world_size == 1, "DCP not support chunked local attn now." + assert pcp_world_size == 1, "PCP not support chunked local attn now." 
max_num_blocks = max_length // kv_cache_spec.block_size if max_length > 0: local_attention_start_idx = ( @@ -611,11 +619,13 @@ def find_longest_cache_hit( kv_cache_spec: KVCacheSpec, use_eagle: bool, dcp_world_size: int = 1, + pcp_world_size: int = 1, ) -> tuple[list[KVCacheBlock], ...]: assert isinstance(kv_cache_spec, MambaSpec), ( "MambaManager can only be used for mamba groups" ) assert dcp_world_size == 1, "DCP not support mamba now." + assert pcp_world_size == 1, "PCP not support mamba now." computed_blocks: tuple[list[KVCacheBlock], ...] = tuple( [] for _ in range(len(kv_cache_group_ids)) ) @@ -705,6 +715,7 @@ def find_longest_cache_hit( kv_cache_spec: KVCacheSpec, use_eagle: bool, dcp_world_size: int = 1, + pcp_world_size: int = 1, ) -> tuple[list[KVCacheBlock], ...]: assert isinstance(kv_cache_spec, CrossAttentionSpec), ( "CrossAttentionManager can only be used for cross-attention groups" diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 3a25827cec38..6be19894d332 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -128,6 +128,7 @@ def __init__( scheduler_block_size = ( vllm_config.cache_config.block_size * vllm_config.parallel_config.decode_context_parallel_size + * vllm_config.parallel_config.prefill_context_parallel_size ) self.scheduler: SchedulerInterface = Scheduler( diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index ad2ece50f981..7e8ebe25c460 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -35,6 +35,7 @@ get_dp_group, get_ep_group, get_inner_dp_world_group, + get_pcp_group, get_pp_group, get_tp_group, ) @@ -110,12 +111,14 @@ def _init_executor(self) -> None: f"({self.parallel_config.nnodes_within_dp}). " ) self.local_world_size = self.parallel_config.local_world_size - tensor_parallel_size = self.parallel_config.tensor_parallel_size - pp_parallel_size = self.parallel_config.pipeline_parallel_size - assert self.world_size == tensor_parallel_size * pp_parallel_size, ( + tp_size = self.parallel_config.tensor_parallel_size + pp_size = self.parallel_config.pipeline_parallel_size + pcp_size = self.parallel_config.prefill_context_parallel_size + assert self.world_size == tp_size * pp_size * pcp_size, ( f"world_size ({self.world_size}) must be equal to the " - f"tensor_parallel_size ({tensor_parallel_size}) x pipeline" - f"_parallel_size ({pp_parallel_size}). " + f"tensor_parallel_size ({tp_size}) x pipeline" + f"_parallel_size ({pp_size}) x prefill_context" + f"_parallel_size ({pcp_size}). " ) # Set multiprocessing envs @@ -424,7 +427,11 @@ def _get_output_rank(self) -> int: # 16-23, PP rank 2 # 24-31, PP rank 3 # so world_size - tp_size = 32 - 8 = 24 should be PP rank = -1 (i.e. 
3) - return self.world_size - self.parallel_config.tensor_parallel_size + return ( + self.world_size + - self.parallel_config.tensor_parallel_size + * self.parallel_config.prefill_context_parallel_size + ) @dataclass @@ -828,6 +835,8 @@ def setup_proc_title_and_log_prefix(enable_ep: bool) -> None: dp_rank = get_dp_group().rank_in_group pp_size = get_pp_group().world_size pp_rank = get_pp_group().rank_in_group + pcp_size = get_pcp_group().world_size + pcp_rank = get_pcp_group().rank_in_group tp_size = get_tp_group().world_size tp_rank = get_tp_group().rank_in_group dcp_size = get_dcp_group().world_size @@ -837,6 +846,8 @@ def setup_proc_title_and_log_prefix(enable_ep: bool) -> None: process_name += f"_DP{dp_rank}" if pp_size > 1: process_name += f"_PP{pp_rank}" + if pcp_size > 1: + process_name += f"_PCP{pcp_rank}" if tp_size > 1: process_name += f"_TP{tp_rank}" if dcp_size > 1: diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py index 7f33eb7e699c..751862aa9c76 100644 --- a/vllm/v1/kv_cache_interface.py +++ b/vllm/v1/kv_cache_interface.py @@ -95,10 +95,11 @@ class FullAttentionSpec(AttentionSpec): def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int: max_model_len = vllm_config.model_config.max_model_len dcp_world_size = vllm_config.parallel_config.decode_context_parallel_size + pcp_world_size = vllm_config.parallel_config.prefill_context_parallel_size # Note(hc): each dcp rank only need save # (max_model_len//dcp_world_size) tokens locally. - if dcp_world_size > 1: - max_model_len = cdiv(max_model_len, dcp_world_size) + if dcp_world_size * pcp_world_size > 1: + max_model_len = cdiv(max_model_len, dcp_world_size * pcp_world_size) return cdiv(max_model_len, self.block_size) * self.page_size_bytes @classmethod diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py index 9f6c19e46430..76e17f3797a1 100644 --- a/vllm/v1/worker/block_table.py +++ b/vllm/v1/worker/block_table.py @@ -4,7 +4,7 @@ import numpy as np import torch -from vllm.distributed import get_dcp_group +from vllm.distributed import get_dcp_group, get_pcp_group from vllm.logger import init_logger from vllm.utils.math_utils import cdiv from vllm.v1.utils import CpuGpuBuffer @@ -22,7 +22,7 @@ def __init__( pin_memory: bool, device: torch.device, kernel_block_size: int, - dcp_kv_cache_interleave_size: int, + cp_kv_cache_interleave_size: int, ): """ Args: @@ -80,6 +80,13 @@ def __init__( else: self._kernel_block_arange = None + try: + self.pcp_world_size = get_pcp_group().world_size + self.pcp_rank = get_pcp_group().rank_in_group + except AssertionError: + # DCP might not be initialized in testing + self.pcp_world_size = 1 + self.pcp_rank = 0 try: self.dcp_world_size = get_dcp_group().world_size self.dcp_rank = get_dcp_group().rank_in_group @@ -87,7 +94,7 @@ def __init__( # DCP might not be initialized in testing self.dcp_world_size = 1 self.dcp_rank = 0 - self.dcp_kv_cache_interleave_size = dcp_kv_cache_interleave_size + self.cp_kv_cache_interleave_size = cp_kv_cache_interleave_size def append_row( self, @@ -131,14 +138,16 @@ def compute_slot_mapping( # NOTE(woosuk): We can't simply use `token_indices // block_size` # here because M (max_model_len) is not necessarily divisible by # block_size. 
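# Numeric preview (toy values, not from the patch) of the interleaved slot
# mapping computed in the hunk below: with a local block_size of 4,
# total_cp_world_size of 2 and cp_kv_cache_interleave_size of 1, token i is
# owned by rank i % 2, matching the Note(hc) comment, and lands at local
# offset i // 2 inside its block.
block_size, world, interleave = 4, 2, 1
virtual_block_size = block_size * world  # 8 token slots per "virtual block"

for pos in range(virtual_block_size):
    off = pos % virtual_block_size
    owner = off // interleave % world
    local_off = off // (world * interleave) * interleave + off % interleave
    assert owner == pos % 2
    assert local_off == pos // 2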
- if self.dcp_world_size > 1: + total_cp_world_size = self.pcp_world_size * self.dcp_world_size + total_cp_rank = self.pcp_rank * self.dcp_world_size + self.dcp_rank + if total_cp_world_size > 1: # Note(hc): The DCP implement store kvcache with an interleave # style, the kvcache for the token whose token_idx is i is # always stored on the GPU whose dcp_rank equals i % cp_world_size: # Use a "virtual block" which equals to world_size * block_size # for block_table_indices calculation. - virtual_block_size = self.block_size * self.dcp_world_size + virtual_block_size = self.block_size * total_cp_world_size block_table_indices = ( req_indices * self.max_num_blocks_per_req + positions // virtual_block_size @@ -150,16 +159,16 @@ def compute_slot_mapping( virtual_block_offsets = positions % virtual_block_size mask = ( virtual_block_offsets - // self.dcp_kv_cache_interleave_size - % self.dcp_world_size - == self.dcp_rank + // self.cp_kv_cache_interleave_size + % total_cp_world_size + == total_cp_rank ) # Calculate local block_offsets block_offsets = ( virtual_block_offsets - // (self.dcp_world_size * self.dcp_kv_cache_interleave_size) - * self.dcp_kv_cache_interleave_size - + virtual_block_offsets % self.dcp_kv_cache_interleave_size + // (total_cp_world_size * self.cp_kv_cache_interleave_size) + * self.cp_kv_cache_interleave_size + + virtual_block_offsets % self.cp_kv_cache_interleave_size ) # Calculate slot_mapping slot_mapping = block_numbers * self.block_size + block_offsets @@ -253,7 +262,7 @@ def __init__( block_sizes: list[int], kernel_block_sizes: list[int], num_speculative_tokens: int = 0, - dcp_kv_cache_interleave_size: int = 1, + cp_kv_cache_interleave_size: int = 1, ) -> None: # Note(hc): each dcp rank only store # (max_model_len//dcp_world_size) tokens in kvcache, @@ -283,7 +292,7 @@ def __init__( pin_memory, device, kernel_block_size, - dcp_kv_cache_interleave_size, + cp_kv_cache_interleave_size, ) for block_size, kernel_block_size in zip(block_sizes, kernel_block_sizes) ] diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index c1bfe727d86e..7b4bc1d2a224 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -87,7 +87,7 @@ def __init__( is_spec_decode: bool = False, is_pooling_model: bool = False, num_speculative_tokens: int = 0, - dcp_kv_cache_interleave_size: int = 1, + cp_kv_cache_interleave_size: int = 1, ): self.is_pooling_model = is_pooling_model self.is_spec_decode = is_spec_decode @@ -141,7 +141,7 @@ def __init__( block_sizes=block_sizes, kernel_block_sizes=kernel_block_sizes, num_speculative_tokens=num_speculative_tokens, - dcp_kv_cache_interleave_size=dcp_kv_cache_interleave_size, + cp_kv_cache_interleave_size=cp_kv_cache_interleave_size, ) # Sampling-related. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0c35f1330e9f..80f8344d4410 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -426,7 +426,7 @@ def __init__( # uses output token ids so we set this conservatively. 
logitsprocs_need_output_token_ids=bool(custom_logitsprocs), is_pooling_model=self.is_pooling_model, - dcp_kv_cache_interleave_size=self.parallel_config.dcp_kv_cache_interleave_size, + cp_kv_cache_interleave_size=self.parallel_config.cp_kv_cache_interleave_size, ) self.use_async_scheduling = self.scheduler_config.async_scheduling @@ -1436,7 +1436,7 @@ def _build_attention_metadata( self.seq_lens.cpu[:num_reqs], self.dcp_world_size, self.dcp_rank, - self.parallel_config.dcp_kv_cache_interleave_size, + self.parallel_config.cp_kv_cache_interleave_size, ) self.dcp_local_seq_lens.copy_to_gpu(num_reqs) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 315f01b68499..b8339fc4dc8b 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -26,6 +26,7 @@ has_kv_transfer_group, ) from vllm.distributed.parallel_state import ( + get_pcp_group, get_pp_group, get_tp_group, ) @@ -733,6 +734,7 @@ def update_moe_modules(moe_modules: list[FusedMoE], num_local_experts: int): module.global_num_experts = module.moe_config.num_experts module.moe_parallel_config = FusedMoEParallelConfig.make( tp_size_=get_tp_group().world_size, + pcp_size_=get_pcp_group().world_size, dp_size_=get_dp_group().world_size, vllm_parallel_config=parallel_config, ) @@ -886,6 +888,7 @@ def init_worker_distributed_environment( ensure_model_parallel_initialized( parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size, + parallel_config.prefill_context_parallel_size, parallel_config.decode_context_parallel_size, ) From 68d7231991cc307d6865eac5bfca551c06f67465 Mon Sep 17 00:00:00 2001 From: Ryan Rock Date: Wed, 19 Nov 2025 15:04:36 -0600 Subject: [PATCH 203/578] [CI/Build] Fix test_prefix_prefill for AMD (#28905) Signed-off-by: Ryan Rock --- tests/kernels/attention/test_prefix_prefill.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/kernels/attention/test_prefix_prefill.py b/tests/kernels/attention/test_prefix_prefill.py index 78cdbbbf7379..e041e8c8d2ff 100644 --- a/tests/kernels/attention/test_prefix_prefill.py +++ b/tests/kernels/attention/test_prefix_prefill.py @@ -174,11 +174,11 @@ def test_contexted_kv_attention( block_table = values[: BS * max_block_per_request].view(BS, max_block_per_request) b_seq_len = torch.tensor(seq_lens, dtype=torch.int32) b_ctx_len = torch.tensor(ctx_lens, dtype=torch.int32) - b_start_loc = torch.cumsum(torch.tensor([0] + query_lens, dtype=torch.int32), dim=0) + b_start_loc = torch.cumsum(torch.tensor([0] + query_lens), dim=0).to(torch.int32) max_input_len = MAX_SEQ_LEN # copy kv to cache - b_seq_start_loc = torch.cumsum( - torch.tensor([0] + seq_lens[:-1], dtype=torch.int32), dim=0 + b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1]), dim=0).to( + torch.int32 ) for i in range(BS): for j in range(query_lens[i]): @@ -417,11 +417,11 @@ def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: block_table = values[: BS * max_block_per_request].view(BS, max_block_per_request) b_seq_len = torch.tensor(seq_lens, dtype=torch.int32) b_ctx_len = torch.tensor(ctx_lens, dtype=torch.int32) - b_start_loc = torch.cumsum(torch.tensor([0] + query_lens, dtype=torch.int32), dim=0) + b_start_loc = torch.cumsum(torch.tensor([0] + query_lens), dim=0).to(torch.int32) max_input_len = MAX_SEQ_LEN # copy kv to cache - b_seq_start_loc = torch.cumsum( - torch.tensor([0] + seq_lens[:-1], dtype=torch.int32), dim=0 + b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1]), dim=0).to( + torch.int32 ) for i in 
range(BS): for j in range(query_lens[i]): From 1607e664f0de4b7eb113c0259b889edbe73c4341 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Wed, 19 Nov 2025 16:18:32 -0500 Subject: [PATCH 204/578] [Bug] Fix Batch Invariant MLA test (#28967) Signed-off-by: yewentao256 --- tests/v1/determinism/test_batch_invariance.py | 41 +++++++++++++++---- vllm/model_executor/layers/batch_invariant.py | 2 +- 2 files changed, 33 insertions(+), 10 deletions(-) diff --git a/tests/v1/determinism/test_batch_invariance.py b/tests/v1/determinism/test_batch_invariance.py index f018ee551dbf..d4e88891512c 100644 --- a/tests/v1/determinism/test_batch_invariance.py +++ b/tests/v1/determinism/test_batch_invariance.py @@ -9,13 +9,33 @@ from utils import _extract_step_logprobs, _random_prompt, skip_unsupported from vllm import LLM, SamplingParams +from vllm.platforms import current_platform + +BACKENDS: list[str] = [ + "FLASH_ATTN", + "FLASHINFER", +] + +if current_platform.is_cuda() and current_platform.is_device_capability(90): + BACKENDS.append("FLASH_ATTN_MLA") + +DEFAULT_MODEL = "Qwen/Qwen3-1.7B" +MLA_MODEL = "deepseek-ai/DeepSeek-V2-Lite-Chat" + + +def resolve_model_name(backend: str) -> str: + """Resolve the model name for the given backend, respecting env overrides.""" + model = os.getenv("VLLM_TEST_MODEL", DEFAULT_MODEL) + if backend.endswith("MLA") and model == DEFAULT_MODEL: + return MLA_MODEL + return model @skip_unsupported @pytest.mark.timeout(1000) @pytest.mark.parametrize( "backend", - ["FLASH_ATTN", "FLASHINFER", "FLASH_ATTN_MLA", "FLASHINFER_MLA", "TRITON_MLA"], + BACKENDS, ) def test_v1_generation_is_deterministic_across_batch_sizes_with_needle( backend, monkeypatch: pytest.MonkeyPatch @@ -47,7 +67,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle( monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend) # Allow overrides from environment (useful for CI tuning) # "facebook/opt-125m" is too small, doesn't reliably test determinism - model = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B") + model = resolve_model_name(backend) num_trials = int(os.getenv("VLLM_NEEDLE_TRIALS", "5")) max_batch_size = int(os.getenv("VLLM_NEEDLE_BATCH_SIZE", "128")) min_random_prompt = int(os.getenv("VLLM_MIN_PROMPT", "1024")) @@ -150,7 +170,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle( @skip_unsupported @pytest.mark.parametrize( "backend", - ["FLASH_ATTN", "FLASHINFER", "FLASH_ATTN_MLA", "FLASHINFER_MLA", "TRITON_MLA"], + BACKENDS, ) @pytest.mark.forked def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( @@ -160,7 +180,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( seed = int(os.getenv("VLLM_TEST_SEED", "12345")) random.seed(seed) - model_name = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B") + model_name = resolve_model_name(backend) tp_size = int(os.getenv("VLLM_TEST_TP_SIZE", "1")) # For batch invariance, disable custom all-reduce to ensure deterministic @@ -369,7 +389,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( @skip_unsupported @pytest.mark.parametrize( "backend", - ["FLASH_ATTN", "FLASHINFER", "FLASH_ATTN_MLA", "FLASHINFER_MLA", "TRITON_MLA"], + BACKENDS, ) def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch): """ @@ -377,7 +397,7 @@ def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch): Useful for quick smoke testing and debugging. 
""" monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend) - model = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B") + model = resolve_model_name(backend) llm = LLM( model=model, @@ -419,7 +439,7 @@ def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch): @skip_unsupported @pytest.mark.parametrize( "backend", - ["FLASH_ATTN", "FLASHINFER", "FLASH_ATTN_MLA", "FLASHINFER_MLA", "TRITON_MLA"], + BACKENDS, ) @pytest.mark.forked def test_logprobs_without_batch_invariance_should_fail( @@ -434,6 +454,9 @@ def test_logprobs_without_batch_invariance_should_fail( The test will PASS if we detect differences (proving batch invariance matters). The test will FAIL if everything matches (suggesting batch invariance isn't needed). """ + from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant + + vllm_is_batch_invariant.cache_clear() monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend) # CRITICAL: Disable batch invariance for this test @@ -441,7 +464,7 @@ def test_logprobs_without_batch_invariance_should_fail( seed = int(os.getenv("VLLM_TEST_SEED", "12345")) random.seed(seed) - model_name = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B") + model_name = resolve_model_name(backend) tp_size = int(os.getenv("VLLM_TEST_TP_SIZE", "1")) print(f"\n{'=' * 80}") @@ -659,7 +682,7 @@ def test_decode_logprobs_match_prefill_logprobs( seed = int(os.getenv("VLLM_TEST_SEED", "12345")) random.seed(seed) - model_name = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B") + model_name = resolve_model_name(backend) tp_size = int(os.getenv("VLLM_TEST_TP_SIZE", "1")) from vllm.model_executor.layers.batch_invariant import ( diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py index 7920d117de5e..5dbeb2917434 100644 --- a/vllm/model_executor/layers/batch_invariant.py +++ b/vllm/model_executor/layers/batch_invariant.py @@ -803,11 +803,11 @@ def override_envs_for_invariance(): "FLASH_ATTN", # best supported backend "FLASHINFER", "FLASH_ATTN_MLA", - "FLASHINFER_MLA", "TRITON_MLA", # Not yet supported MLA backends # "FLASHMLA", # "FLEX_ATTENTION", # IMA issue even if we disable batch invariance + # "FLASHINFER_MLA", https://github.com/vllm-project/vllm/pull/28967 ] if curr_attn_backend not in supported_backends: warning = ( From cdeec2e6067613c501f82463d54e420097f49750 Mon Sep 17 00:00:00 2001 From: Julien Denize <40604584+juliendenize@users.noreply.github.com> Date: Wed, 19 Nov 2025 22:20:58 +0100 Subject: [PATCH 205/578] [BugFix] Ray with multiple nodes (#28873) Signed-off-by: Julien Denize --- vllm/v1/worker/gpu_worker.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index b8339fc4dc8b..7f9cdd221224 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -205,14 +205,14 @@ def init_device(self): assert self.local_rank < torch.cuda.device_count(), ( f"DP adjusted local rank {self.local_rank} is out of bounds. " ) - visible_device_count = ( - torch.cuda.device_count() if torch.cuda.is_available() else 0 - ) - assert self.parallel_config.local_world_size <= visible_device_count, ( - f"local_world_size ({self.parallel_config.local_world_size}) must be " - f"less than or equal to the number of visible devices " - f"({visible_device_count})." 
- ) + visible_device_count = ( + torch.cuda.device_count() if torch.cuda.is_available() else 0 + ) + assert self.parallel_config.local_world_size <= visible_device_count, ( + f"local_world_size ({self.parallel_config.local_world_size}) must " + f"be less than or equal to the number of visible devices " + f"({visible_device_count})." + ) self.device = torch.device(f"cuda:{self.local_rank}") current_platform.set_device(self.device) From 613abb50d5715ba693ee9d5b727e8385b98e7185 Mon Sep 17 00:00:00 2001 From: Shu Wang Date: Wed, 19 Nov 2025 15:29:06 -0600 Subject: [PATCH 206/578] [MoE] Nvfp4 Masked Gemm: Add flashinfer grouped_gemm_nt_masked (#25990) Signed-off-by: Shu Wang. Signed-off-by: mgoin Co-authored-by: Michael Goin --- .buildkite/test-pipeline.yaml | 1 + tests/kernels/moe/test_cutedsl_moe.py | 582 ++++++++++++++++++ vllm/envs.py | 8 +- .../fused_moe/deepep_ll_prepare_finalize.py | 16 +- .../fused_moe/flashinfer_cutedsl_moe.py | 346 +++++++++++ .../layers/quantization/modelopt.py | 30 +- .../quantization/utils/flashinfer_fp4_moe.py | 43 +- .../quantization/utils/flashinfer_utils.py | 21 +- .../quantization/utils/nvfp4_moe_support.py | 6 +- vllm/utils/flashinfer.py | 42 ++ 10 files changed, 1062 insertions(+), 33 deletions(-) create mode 100644 tests/kernels/moe/test_cutedsl_moe.py create mode 100644 vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 98daebcc0693..5309581d8e81 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -921,6 +921,7 @@ steps: - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - pytest -v -s tests/kernels/moe/test_flashinfer.py + - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py - label: Blackwell Fusion and Compile Tests # 30 min timeout_in_minutes: 40 diff --git a/tests/kernels/moe/test_cutedsl_moe.py b/tests/kernels/moe/test_cutedsl_moe.py new file mode 100644 index 000000000000..af1a34d17d48 --- /dev/null +++ b/tests/kernels/moe/test_cutedsl_moe.py @@ -0,0 +1,582 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import pytest + +from vllm.platforms import current_platform + +if not current_platform.has_device_capability(100): + pytest.skip( + reason="Nvfp4 Requires compute capability of 10 or above.", + allow_module_level=True, + ) + +import torch +from flashinfer import fp4_quantize +from torch.nn import functional as F + +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.fused_moe.flashinfer_cutedsl_moe import ( + flashinfer_cutedsl_moe_masked, +) +from vllm.utils.flashinfer import ( + flashinfer_cutedsl_grouped_gemm_nt_masked as cutedsl_gmm_masked, +) +from vllm.utils.flashinfer import ( + scaled_fp4_grouped_quantize, +) + +kE2M1ToFloat = torch.tensor( + [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0], dtype=torch.float32 +) + +FLOAT8_E4M3_MAX = 448.0 +FLOAT4_E2M1_MAX = 6.0 + + +def convert_swizzled_to_linear(a_sf_swizzled: torch.Tensor, m, k, block_size): + m_tiles = (m + 128 - 1) // 128 + f = block_size * 4 + k_tiles = (k + f - 1) // f + tmp = torch.reshape(a_sf_swizzled, (1, m_tiles, k_tiles, 32, 4, 4)) + tmp = torch.permute(tmp, (0, 1, 4, 3, 2, 5)) + out = tmp.reshape(m_tiles * 128, k_tiles * f // block_size) + return out[0:m, 0:k] + + +def dequantize_nvfp4_to_dtype( + tensor_fp4, tensor_sf, global_scale, dtype, device, block_size=16 +): + """Dequantize the fp4 tensor back 
to high precision.""" + # Two fp4 values are packed into one uint8. + assert tensor_fp4.dtype == torch.uint8 + m, packed_k = tensor_fp4.shape + k = packed_k * 2 + tensor_f32 = break_fp4_bytes(tensor_fp4, dtype) + tensor_f32 = tensor_f32.reshape(m, k // block_size, block_size) + tensor_sf = tensor_sf.view(torch.float8_e4m3fn) + tensor_sf = convert_swizzled_to_linear(tensor_sf, m, k, block_size) + tensor_sf_dtype = tensor_sf.to(torch.float32) / global_scale + + # scale the tensor + out = (tensor_f32 * tensor_sf_dtype.unsqueeze(-1)).reshape(m, k) + return out.to(dtype=dtype) + + +def break_fp4_bytes(a, dtype): + assert a.dtype == torch.uint8 + m, n = a.shape + + # Vectorized nibble processing + a_flat = a.flatten() + high = (a_flat & 0xF0) >> 4 # Upper nibbles + low = a_flat & 0x0F # Lower nibbles + + # Combine nibbles for batch processing + combined = torch.stack((low, high), dim=1).flatten() + + # Vectorized sign and magnitude extraction + signs = (combined & 0x08).to(torch.bool) # Sign bits + abs_vals = (combined & 0x07).to(torch.long) # Magnitude indices + + # Device-aware lookup and sign application + kE2M1 = kE2M1ToFloat.to(device=a.device) + values = kE2M1[abs_vals] * torch.where(signs, -1.0, 1.0) + + # Reshape to final form + return values.reshape(m, n * 2).to(dtype=dtype) + + +def generate_balanced_routing( + hidden_states: torch.Tensor, num_experts: int, top_k: int +): + """ + Generate routing weights and topk indices such that every expert is active. + Returns routing_weights, topk_idx + """ + + num_tokens, hidden_dim = hidden_states.shape + # num_tokens = batch_size * seq_len + + # First, assign at least one token per expert + tokens_per_expert = torch.arange(num_tokens) % num_experts + tokens_per_expert = tokens_per_expert[torch.randperm(num_tokens)] # shuffle + + # Each token has top_k experts — start with one guaranteed expert + topk_idx = torch.full((num_tokens, top_k), -1, dtype=torch.long) + topk_idx[:, 0] = tokens_per_expert + + # For remaining top_k - 1 experts, pick randomly (allowing repeats) + if top_k > 1: + random_choices = torch.randint(0, num_experts, (num_tokens, top_k - 1)) + topk_idx[:, 1:] = random_choices + + # Normalize routing weights so each token's weights sum to 1 + routing_weights = torch.rand(num_tokens, top_k) + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) + + # Reshape back if needed + routing_weights = routing_weights.view(num_tokens, top_k) + topk_idx = topk_idx.view(num_tokens, top_k) + + return routing_weights, topk_idx + + +def prepare_inputs( + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + num_experts: int, + topk: int, +): + routing_weights, topk_idx = generate_balanced_routing( + router_logits, num_experts, topk + ) + + masked_m = [] + for i in range(num_experts): + mask = topk_idx.view(-1) == i + masked_m.append(mask.sum()) + + masked_m = torch.tensor(masked_m, dtype=torch.int32) + # Intialize the hidden_states_3d with ones instead of empty to avoid nan + # issue. 
+ hidden_states_3d = torch.ones( + (num_experts, max(masked_m), hidden_states.shape[1]), dtype=hidden_states.dtype + ) + for i in range(num_experts): + hidden_states_3d[i, : masked_m[i], :] = hidden_states[topk_idx.view(-1) == i] + + return hidden_states_3d, masked_m, topk_idx, routing_weights + + +MNK_FACTORS = [ + (2, 1024, 1024), + (2, 1024, 1536), + (2, 3072, 1024), + (2, 3072, 1536), + (64, 1024, 1024), + (64, 1024, 1536), + (64, 3072, 1024), + (64, 2048, 1024), + (224, 1024, 1024), + (224, 1024, 1536), +] + + +# Reference implementation of torch_moe +def torch_moe(a, w1, w2, score, topk, expert_map): + B, D = a.shape + a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D) + out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device) + score = torch.softmax(score, dim=-1, dtype=torch.float32) + topk_weight, topk_ids = torch.topk(score, topk) + topk_weight = topk_weight.view(-1) + topk_ids = topk_ids.view(-1) + if expert_map is not None: + topk_ids = expert_map[topk_ids] + for i in range(w1.shape[0]): + mask = topk_ids == i + if mask.sum(): + out[mask] = SiluAndMul()(a[mask] @ w1[i].transpose(0, 1)) @ w2[i].transpose( + 0, 1 + ) + return ( + out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype) + ).sum(dim=1) + + +def torch_moe_nvfp4(a, w1, w2, topk, topk_weight, topk_ids): + B, D = a.shape + a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D) + out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device) + + topk_weight = topk_weight.view(-1) + topk_ids = topk_ids.view(-1) + + for i in range(w1.shape[0]): + mask = topk_ids == i + if mask.sum(): + m = w1[i].shape[0] + assert m % 2 == 0 + # Note: w1 and w3 are swapped! + w3_expert, w1_expert = w1[i][m // 2 :, :], w1[i][: m // 2, :] + inter = F.silu(a[mask] @ w1_expert.t()) * (a[mask] @ w3_expert.t()) + inter_gs = torch.tensor(1.0).cuda() + inter_q, inter_blockscale = fp4_quantize(inter, inter_gs) + inter = dequantize_nvfp4_to_dtype( + inter_q, + inter_blockscale, + inter_gs, + dtype=inter.dtype, + device=inter.device, + block_size=16, + ).cuda() + out[mask] = inter @ w2[i].transpose(0, 1) + return ( + out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype) + ).sum(dim=1) + + +def grouped_gemm_ref( + hidden_states_expanded: torch.Tensor, + hidden_states_3d: torch.Tensor, + weights: torch.Tensor, + topk_idx: torch.Tensor, + masked_m: torch.Tensor, + B: int, + topk: int, + num_experts: int, + *, + block_size: int = 16, +) -> torch.Tensor: + """ + Computes the reference grouped GEMM (fp4 quantized per-expert loop), + computes flashinfer grouped GEMM (for scale consistency), + and returns ONLY the repacked reference output: out_ref. 
+ + Returns: + out_ref: Tensor [num_experts, max_m, n_out] + """ + device_hs = hidden_states_expanded.device + device_w = weights.device + out_dtype = weights.dtype + n_out = weights.shape[1] + + # Flattened reference output (B*topk, n_out) + out = torch.zeros((B * topk, n_out), dtype=out_dtype, device=device_w) + + # Per-expert reference compute loop + for i in range(num_experts): + mask = topk_idx.view(-1) == i + if mask.any(): + lhs = hidden_states_expanded[mask] + rhs = weights[i] + + a_amax = lhs.abs().max().to(torch.float32).to(device_hs) + b_amax = rhs.abs().max().to(torch.float32).to(device_w) + + a_gs = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / a_amax + b_gs = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / b_amax + + lhsq, lhsq_sf = fp4_quantize(lhs, a_gs) + rhsq, rhsq_sf = fp4_quantize(rhs, b_gs) + + lhs_in_dtype = dequantize_nvfp4_to_dtype( + lhsq, + lhsq_sf, + a_gs, + dtype=lhs.dtype, + device=device_hs, + block_size=block_size, + ) + rhs_in_dtype = dequantize_nvfp4_to_dtype( + rhsq, + rhsq_sf, + b_gs, + dtype=rhs.dtype, + device=device_w, + block_size=block_size, + ) + + out[mask] = lhs_in_dtype @ rhs_in_dtype.t() + + # Determine per-expert max_m + max_m_val = int(masked_m.max().item()) + + # Repack into [num_experts, max_m, n_out] + out_ref = torch.zeros( + (num_experts, max_m_val, n_out), + dtype=out.dtype, + device=out.device, + ) + expert_slot = [0] * num_experts + + for i, expert_id in enumerate(topk_idx.view(-1).tolist()): + slot = expert_slot[expert_id] + if slot < max_m_val: + out_ref[expert_id, slot, :] = out[i] + expert_slot[expert_id] += 1 + else: + raise IndexError( + f"Expert {expert_id} exceeded max slots ({max_m_val}). " + "Increase max_m or check masked_m." + ) + + return out_ref + + +def flashinfer_cutedsl_grouped_gemm_nt_masked( + hidden_states: torch.Tensor, # 3d + input_global_scale: torch.Tensor, # (l,) + weights: torch.Tensor, + w_global_scale: torch.Tensor, # (l,) + masked_m: torch.Tensor, +): + # hidden_states: [l, m, k] + # weights: [l, n, k] + aq, aq_sf = scaled_fp4_grouped_quantize( + hidden_states, + masked_m.to(hidden_states.device), + input_global_scale, + ) + num_experts, n, k = weights.shape + bq, bq_sf = scaled_fp4_grouped_quantize( + weights, + torch.full((num_experts,), n, device=weights.device, dtype=torch.int32), + w_global_scale, + ) + + out = torch.zeros( + (num_experts, max(masked_m), n), dtype=weights.dtype, device=aq.device + ) + out = out.permute(1, 2, 0) # requirement of kernel + sf_vec_size = 16 + ab_dtype = "float4_e2m1fn" + sf_dtype = "float8_e4m3fn" + c_dtype = "bfloat16" + alpha = 1.0 / (input_global_scale * w_global_scale).to(out.dtype).view( + 1, 1, num_experts + ) + + def get_cute_dtype(input: torch.Tensor) -> str: + if input.dtype == torch.bfloat16: + return "bfloat16" + elif input.dtype == torch.float16: + return "float16" + elif input.dtype == torch.float32: + return "float32" + else: + raise ValueError(f"Unsupported cute dtype {input.dtype}") + + cutedsl_gmm_masked( + (aq, aq_sf), + (bq, bq_sf), + out, + masked_m.to(aq.device), + ab_dtype=ab_dtype, + sf_dtype=sf_dtype, + c_dtype=c_dtype, + sf_vec_size=sf_vec_size, + alpha=alpha, + alpha_dtype=get_cute_dtype(alpha), + ) + + return out + + +@pytest.mark.parametrize("bs, hidden_dim, inter_dim", [(2, 128, 256), (16, 128, 512)]) +@pytest.mark.parametrize("topk", [1, 2, 4]) +@torch.inference_mode() +def test_flashinfer_cutedsl_moe_masked( + bs: int, hidden_dim: int, inter_dim: int, topk: int +): + torch.manual_seed(42) + device = "cuda" + num_experts = 8 + hidden_states = ( + torch.randn(bs, 
hidden_dim, dtype=torch.bfloat16, device=device) / 5.0 + ) + w1 = ( + torch.randn( + num_experts, 2 * inter_dim, hidden_dim, dtype=torch.bfloat16, device=device + ) + / 10.0 + ) + w2 = ( + torch.randn( + num_experts, hidden_dim, inter_dim, dtype=torch.bfloat16, device=device + ) + / 10.0 + ) + router_logits = torch.randn(bs, num_experts, dtype=torch.float32) + + hidden_states_expanded = ( + hidden_states.view(bs, -1, hidden_dim) + .repeat(1, topk, 1) + .reshape(-1, hidden_dim) + ) + hidden_states_3d, masked_m, topk_idx, routing_weights = prepare_inputs( + hidden_states_expanded, router_logits, num_experts, topk + ) + + w1_amax = w1.abs().amax(dim=(1, 2)).to(torch.float32).to(w1.device) + w2_amax = w2.abs().amax(dim=(1, 2)).to(torch.float32).to(w2.device) + input_global_scale = torch.ones( + (num_experts,), dtype=torch.float32, device=hidden_states.device + ) + + w1_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / w1_amax + w2_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / w2_amax + a2_global_scale = torch.ones( + (num_experts,), dtype=torch.float32, device=hidden_states.device + ) # assume intermediate scale is 1.0 + + w1_fp4, w1_blockscale = scaled_fp4_grouped_quantize( + w1, + torch.ones(num_experts, dtype=torch.int32, device=w1.device) * 2 * inter_dim, + w1_global_scale, + ) + w2_fp4, w2_blockscale = scaled_fp4_grouped_quantize( + w2, + torch.ones(num_experts, dtype=torch.int32, device=w2.device) * hidden_dim, + w2_global_scale, + ) + + w1_alpha = 1.0 / (input_global_scale * w1_global_scale) + w2_alpha = 1.0 / (a2_global_scale * w2_global_scale) + + out = torch.empty_like(hidden_states_3d) + # Note: the 1st dim shouldn't be bs + wk = torch.empty( + num_experts, + hidden_states_3d.shape[1], + inter_dim * 2, + dtype=hidden_states_3d.dtype, + device=hidden_states.device, + ) + flashinfer_cutedsl_moe_masked( + hidden_states_3d.to(hidden_states.device), + input_global_scale, + w1_fp4.permute(2, 0, 1), + w1_blockscale, + w1_alpha, + w2_fp4.permute(2, 0, 1), + a2_global_scale, + w2_blockscale, + w2_alpha, + masked_m.to(hidden_states.device), + wk, + out, + ) + + # reference + a_fp4, a_scale_interleaved = fp4_quantize(hidden_states, input_global_scale) + a_in_dtype = dequantize_nvfp4_to_dtype( + a_fp4, + a_scale_interleaved, + input_global_scale, + dtype=hidden_states.dtype, + device=hidden_states.device, + block_size=16, + ) + w1_d = torch.empty( + (num_experts, 2 * inter_dim, hidden_dim), device=w1.device, dtype=w1.dtype + ) + w2_d = torch.empty( + (num_experts, hidden_dim, inter_dim), device=w2.device, dtype=w2.dtype + ) + + for idx in range(0, num_experts): + w1_fp4_sliced, w1_blockscale_sliced = fp4_quantize( + w1[idx], w1_global_scale[idx] + ) + w2_fp4_sliced, w2_blockscale_sliced = fp4_quantize( + w2[idx], w2_global_scale[idx] + ) + w1_d[idx] = dequantize_nvfp4_to_dtype( + w1_fp4_sliced, + w1_blockscale_sliced, + w1_global_scale[idx], + dtype=w1.dtype, + device=w1.device, + block_size=16, + ) + w2_d[idx] = dequantize_nvfp4_to_dtype( + w2_fp4_sliced, + w2_blockscale_sliced, + w2_global_scale[idx], + dtype=w2.dtype, + device=w2.device, + block_size=16, + ) + + ref_output = torch_moe_nvfp4( + a_in_dtype, + w1_d, + w2_d, + topk, + routing_weights.to(a_in_dtype.device), + topk_idx.to(a_in_dtype.device), + ) + out_weighted = torch.zeros_like(ref_output, device=out.device, dtype=out.dtype) + + positions = torch.nonzero(masked_m[topk_idx], as_tuple=False) + rows, cols = positions[:, 0], positions[:, 1] + experts = topk_idx[rows, cols] + for i in range(num_experts): + mask = experts == i + 
if mask.any(): + idx = torch.nonzero(mask, as_tuple=False).squeeze(-1) + r, c = rows[idx], cols[idx] + out_weighted[r] += out[i, : len(r), :] * routing_weights[r, c].to( + out.device + ).unsqueeze(-1) + torch.testing.assert_close( + out_weighted.cpu(), ref_output.cpu(), atol=2e-1, rtol=2e-1 + ) + + +@pytest.mark.parametrize( + "bs, hidden_dim, inter_dim, topk", [(2, 128, 256, 2), (16, 128, 512, 5)] +) +@torch.inference_mode() +def test_grouped_gemm_nt_masked( + bs: int, hidden_dim: int, inter_dim: int, topk: int +) -> None: + torch.manual_seed(42) + B = bs + D = hidden_dim + N = inter_dim + # CuteDSL group gemm has issue when not all experts are active. + # i.e. masked = [2, 3, 0, 0, 1] where the 2nd and 3rd experts are inactive + # see https://github.com/flashinfer-ai/flashinfer/issues/1856 + num_experts = bs + hidden_states = torch.randn(B, D, dtype=torch.bfloat16, device="cuda") + weights = torch.randn(num_experts, N, D, dtype=torch.bfloat16, device="cuda") + router_logits = torch.randn(B, num_experts, dtype=torch.float32) + + hidden_states_expanded = ( + hidden_states.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D) + ) + hidden_states_3d, masked_m, topk_idx, _ = prepare_inputs( + hidden_states_expanded, router_logits, num_experts, topk + ) + + a_amax = ( + hidden_states_3d.abs() + .amax(dim=(1, 2)) + .to(torch.float32) + .to(hidden_states.device) + ) + b_amax = weights.abs().amax(dim=(1, 2)).to(torch.float32).to(weights.device) + a_gs = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / a_amax + b_gs = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / b_amax + out_flashinfer = flashinfer_cutedsl_grouped_gemm_nt_masked( + hidden_states_3d.to(hidden_states.device), a_gs, weights, b_gs, masked_m + ) + # reference + out_ref = grouped_gemm_ref( + hidden_states_expanded=hidden_states_expanded, + hidden_states_3d=hidden_states_3d, + weights=weights, + topk_idx=topk_idx, + masked_m=masked_m, + B=B, + topk=topk, + num_experts=num_experts, + ) + # Note: just to compare the masked position due to cutedsl may write nan + # into unmasked position. + for i in range(num_experts): + torch.testing.assert_close( + out_flashinfer.permute(2, 0, 1)[i, : masked_m[i]], + out_ref.to(out_flashinfer.device)[i, : masked_m[i]], + atol=1e-1, + rtol=1e-1, + ) + + +if __name__ == "__main__": + test_flashinfer_cutedsl_moe_masked(16, 128, 512, 4) + test_grouped_gemm_nt_masked(16, 128, 512, 4) diff --git a/vllm/envs.py b/vllm/envs.py index 212d68114e46..1ff620af5722 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -157,7 +157,9 @@ VLLM_USE_FLASHINFER_MOE_FP16: bool = False VLLM_USE_FLASHINFER_MOE_FP8: bool = False VLLM_USE_FLASHINFER_MOE_FP4: bool = False - VLLM_FLASHINFER_MOE_BACKEND: Literal["throughput", "latency"] = "latency" + VLLM_FLASHINFER_MOE_BACKEND: Literal["throughput", "latency", "masked_gemm"] = ( + "latency" + ) VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE: int = 394 * 1024 * 1024 VLLM_XGRAMMAR_CACHE_MB: int = 0 VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256 @@ -1238,7 +1240,9 @@ def get_vllm_port() -> int | None: # - "latency": # Uses TensorRT-LLM kernels optimized for low-latency inference. "VLLM_FLASHINFER_MOE_BACKEND": env_with_choices( - "VLLM_FLASHINFER_MOE_BACKEND", "latency", ["throughput", "latency"] + "VLLM_FLASHINFER_MOE_BACKEND", + "latency", + ["throughput", "latency", "masked_gemm"], ), # Control the workspace buffer size for the FlashInfer backend. 
"VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE": lambda: int( diff --git a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py index e0db248958b4..fea9f49c04b8 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py @@ -6,6 +6,7 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm import envs from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( @@ -27,6 +28,8 @@ DEEPEP_QUANT_BLOCK_SIZE = 128 DEEPEP_QUANT_BLOCK_SHAPE = [DEEPEP_QUANT_BLOCK_SIZE, DEEPEP_QUANT_BLOCK_SIZE] +logger = init_logger(__name__) + def dequant_fp8( expert_x_fp8: torch.Tensor, expert_x_scales: torch.Tensor @@ -187,16 +190,25 @@ def _do_quant( # TODO (varun): Optimization - Use a batched version of quant x = x.view((-1, hidden_dim)) + q_dtype = quant_config.quant_dtype + + if envs.VLLM_FLASHINFER_MOE_BACKEND == "masked_gemm": + logger.info_once( + "Skip quantization when using FlashInfer CUTEDSL(masked_gemm) " + "for ModelOptNvFp4FusedMoE." + ) + q_dtype = None + x, x_scales = moe_kernel_quantize_input( x, quant_config.a1_scale, - quant_config.quant_dtype, + q_dtype, quant_config.per_act_token_quant, quant_config.block_shape, ) x = x.view((num_experts, -1, hidden_dim)) - if quant_config.quant_dtype is not None: + if q_dtype is not None: assert x_scales is not None x_scales = normalize_batched_scales_shape(x_scales, num_experts) diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py new file mode 100644 index 000000000000..2747ef04a349 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py @@ -0,0 +1,346 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch + +import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig +from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( + TopKWeightAndReduceDelegate, +) +from vllm.utils.flashinfer import ( + flashinfer_cutedsl_grouped_gemm_nt_masked, + has_flashinfer_cutedsl_grouped_gemm_nt_masked, + scaled_fp4_grouped_quantize, + silu_and_mul_scaled_nvfp4_experts_quantize, +) + +logger = init_logger(__name__) + + +def is_valid_flashinfer_cutedsl_fused_moe( + hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor +) -> bool: + """ + Check if the given problem size is supported by the FlashInfer CuteDSL MoE + kernel. + """ + if not has_flashinfer_cutedsl_grouped_gemm_nt_masked(): + logger.debug_once( + "FlashInferCuteDSLExperts disabled: " + "flashinfer_cutedsl_fused_moe not available." + ) + return False + # Data type checks + if ( + w1.dtype != torch.uint8 + or w2.dtype != torch.uint8 + or hidden_states.dtype not in [torch.float32, torch.float16, torch.bfloat16] + ): + logger.debug_once( + "FlashInferCuteDSLExperts disabled: w1/w2 must be torch.uint8 " + f"(got w1={w1.dtype}, w2={w2.dtype}), hidden_states must be " + f"float32, float16, or bfloat16 (got {hidden_states.dtype})." 
+ ) + return False + return True + + +class FlashInferCuteDSLExperts(mk.FusedMoEPermuteExpertsUnpermute): + def __init__( + self, + out_dtype: torch.dtype, + quant_config: FusedMoEQuantConfig, + ): + super().__init__(quant_config) + assert quant_config.quant_dtype == "nvfp4", ( + "Only nvfp4 quantization are currently supported." + ) + self.out_dtype = out_dtype + + @property + def activation_formats( + self, + ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]: + return ( + mk.FusedMoEActivationFormat.BatchedExperts, + mk.FusedMoEActivationFormat.BatchedExperts, + ) + + def supports_expert_map(self) -> bool: + return False + + def supports_chunking(self) -> bool: + # This refers to TP chunking; DP chunking is handled separately. + # TODO(shuw@nvidia.com): Set to False to be consistent with + # batched_deep_gemm_moe + return False + + def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: + # Let PrepareAndFinalize::finalize() decide the impl. + return TopKWeightAndReduceDelegate() + + def workspace_shapes( + self, + M: int, + N: int, + K: int, + topk: int, + global_num_experts: int, + local_num_experts: int, + expert_tokens_meta: mk.ExpertTokensMetadata | None, + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: + # We use global_num_experts due to how moe_align_block_size handles + # expert_maps. + """ + Compute the shapes for the temporary and final outputs of the two gemms + and activation in the fused expert function. Since the gemms are + independent, the workspace for the first gemm can be shared with the + workspace for the last gemm. + + Returns a tuple of: + - workspace13 shape tuple: must be large enough to hold the + result of either expert gemm. + - workspace2 shape tuple: must be large enough to hold the + result of the activation function. + - output shape tuple: must be exact size of the final gemm output. + - Workspace type: The dtype to use for the workspace tensors. + - Note: in order for activation chunking to work, the first dimension + of each tuple must be the number of tokens. + """ + output_shape = (local_num_experts, M, K) + workspace2 = (local_num_experts, M, N) + workspace1 = output_shape + return (workspace1, workspace2, output_shape) + + def apply( + self, + output: torch.Tensor, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: torch.Tensor | None, + a1q_scale: torch.Tensor | None, + a2_scale: torch.Tensor | None, # Not used + workspace13: torch.Tensor | None, + workspace2: torch.Tensor | None, + expert_tokens_meta: mk.ExpertTokensMetadata | None, + apply_router_weight_on_input: bool | None, + ): + assert self.quant_dtype == "nvfp4", ( + "Only nvfp4 quantization are currently supported." 
+ ) + # Ensure w1_scale and w2_scale are not None before calling view + assert self.w1_scale is not None and self.w2_scale is not None, ( + "w1_scale and w2_scale must not be None for FlashInferExperts" + ) + assert expert_tokens_meta is not None + expert_num_tokens = expert_tokens_meta.expert_num_tokens + assert hidden_states.ndim == 3 + assert self.w1_scale.ndim == 3 + assert self.w2_scale.ndim == 3 + flashinfer_cutedsl_moe_masked( + hidden_states=hidden_states, + input_global_scale=self.a1_gscale, + w1=w1, + w1_blockscale=self.w1_scale, + w1_alpha=self.g1_alphas, + w2=w2, + a2_global_scale=self.a2_gscale, + w2_blockscale=self.w2_scale, + w2_alpha=self.g2_alphas, + masked_m=expert_num_tokens, + workspace=workspace2, + out=output, + ) + + +def get_cute_dtype(input: torch.Tensor) -> str: + if input.dtype == torch.bfloat16: + return "bfloat16" + elif input.dtype == torch.float16: + return "float16" + elif input.dtype == torch.float32: + return "float32" + else: + raise ValueError(f"Unsupported cute dtype {input.dtype}") + + +def flashinfer_cutedsl_moe_masked( + hidden_states: torch.Tensor, + input_global_scale: torch.Tensor, + w1: torch.Tensor, + w1_blockscale: torch.Tensor, + w1_alpha, + w2: torch.Tensor, + a2_global_scale: torch.Tensor, + w2_blockscale: torch.Tensor, + w2_alpha, + masked_m: torch.Tensor, + workspace: torch.Tensor, + out: torch.Tensor, +): + """ + Perform masked Mixture-of-Experts computation with FlashInfer's CuteDSL + kernels. + + Args: + hidden_states (torch.Tensor): [num_experts, m, k], bf16 + input_global_scale (torch.Tensor): (l,) + w1 (torch.Tensor): fp4 weights, [l, 2 * n, k // 2], uint8 + w1_blockscale (torch.Tensor): blockscale factors, e4m3, + w1_alpha (torch.Tensor): (l,) + w2 (torch.Tensor): fp4 weights, [l, k, n // 2], uint8 + a2_global_scale (torch.Tensor): (l,) + w2_blockscale (torch.Tensor): blockscale factors, e4m3, + w2_alpha (torch.Tensor): (l,) + masked_m (torch.Tensor): Masked dimension indices + workspace (torch.Tensor): For gateup_output + + Notes: + - Assumes max(masked_m) <= m. 
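+        - Shape sketch (illustrative sizes only, not taken from a real model):
+          with l=8 experts, m=64, k=128 and an intermediate size n=512, the
+          expected shapes are hidden_states [8, 64, 128] (bf16),
+          w1 [8, 1024, 64] (uint8), w2 [8, 128, 256] (uint8),
+          workspace [8, 64, 1024] for the gate-up output, and out [8, 64, 128].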
+ """ + + # === Assertions on dtypes === + assert input_global_scale.dtype == torch.float32, ( + f"input_global_scale must be float32, got {input_global_scale.dtype}" + ) + assert w1.dtype == torch.uint8, f"w1 must be uint8, got {w1.dtype}" + assert w1_blockscale.dtype == torch.float8_e4m3fn, ( + f"w1_blockscale must be float8_e4m3fn, got {w1_blockscale.dtype}" + ) + assert w1_alpha.dtype == torch.float32, ( + f"w1_alpha must be float32, got {w1_alpha.dtype}" + ) + assert w2.dtype == torch.uint8, f"w2 must be uint8, got {w2.dtype}" + assert a2_global_scale.dtype == torch.float32, ( + f"a2_global_scale must be float32, got {a2_global_scale.dtype}" + ) + assert w2_blockscale.dtype == torch.float8_e4m3fn, ( + f"w2_blockscale must be float8_e4m3fn, got {w2_blockscale.dtype}" + ) + assert w2_alpha.dtype == torch.float32, ( + f"w2_alpha must be float32, got {w2_alpha.dtype}" + ) + + # === Assertions on shapes === + n = w2.shape[-1] * 2 # intermediate dimension + num_experts, m, k = hidden_states.shape + + assert w1.shape[-2] == 2 * n, f"w1 last-2 dim must be 2*n, got {w1.shape}" + assert w1.shape[-1] * 2 == k, ( + f"w1 last dim * 2 must equal k, got {w1.shape[-1]} vs k={k}" + ) + assert w2.shape[-2:] == ( + k, + n // 2, + ), f"w2 shape mismatch, got {w2.shape[-2:]}, expected {(k, n // 2)}" + + assert input_global_scale.shape == (num_experts,), ( + f"input_global_scale must be (l,), got {input_global_scale.shape}" + ) + assert w1_alpha.shape == (num_experts,), ( + f"w1_alpha must be (l,), got {w1_alpha.shape}" + ) + assert a2_global_scale.shape == (num_experts,), ( + f"a2_global_scale must be (l,), got {a2_global_scale.shape}" + ) + assert w2_alpha.shape == (num_experts,), ( + f"w2_alpha must be (l,), got {w2_alpha.shape}" + ) + + aq, aq_sf = scaled_fp4_grouped_quantize( + hidden_states, + masked_m, + input_global_scale, + ) + + workspace = workspace.permute(1, 2, 0) # requirement of kernel + sf_vec_size = 16 + assert aq_sf.dtype == torch.float8_e4m3fn + assert aq.dtype == torch.uint8 + ab_dtype = "float4_e2m1fn" + sf_dtype = "float8_e4m3fn" + + c_dtype = get_cute_dtype(hidden_states) + + # Gemm1 + flashinfer_cutedsl_grouped_gemm_nt_masked( + (aq, aq_sf), + (w1.permute(1, 2, 0), w1_blockscale), + workspace, + masked_m, + ab_dtype=ab_dtype, + sf_dtype=sf_dtype, + c_dtype=c_dtype, + sf_vec_size=sf_vec_size, + alpha=w1_alpha.view(1, 1, num_experts), + alpha_dtype=get_cute_dtype(w1_alpha), + ) # in logical [m, n, l] + + # SILU and quantization + diq, diq_sf = silu_and_mul_scaled_nvfp4_experts_quantize( + workspace.permute(2, 0, 1), + masked_m, + a2_global_scale, + ) + + # Gemm2 + out = out.permute(1, 2, 0) # requirement of kernel + flashinfer_cutedsl_grouped_gemm_nt_masked( + (diq, diq_sf), + (w2.permute(1, 2, 0), w2_blockscale), + out, + masked_m, + ab_dtype=ab_dtype, + sf_dtype=sf_dtype, + c_dtype=c_dtype, + sf_vec_size=sf_vec_size, + alpha=w2_alpha.view(1, 1, num_experts), + alpha_dtype=get_cute_dtype(w2_alpha), + ) # in logical [m, k, l] + out = out.permute(2, 0, 1) + + +def flashinfer_cutedsl_moe_fp4( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + quant_config: FusedMoEQuantConfig, + inplace: bool = False, + activation: str = "silu", + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, +) -> torch.Tensor: + from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501 + create_flashinfer_prepare_finalize, + ) + 
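+    # Rough composition: the prepare/finalize stage from
+    # create_flashinfer_prepare_finalize handles routing/dispatch and the final
+    # top-k weighting and reduction, while FlashInferCuteDSLExperts runs the
+    # two masked grouped GEMMs with the silu-and-mul + nvfp4 re-quantization
+    # step in between (see flashinfer_cutedsl_moe_masked above).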
+ fused_experts = mk.FusedMoEModularKernel( + create_flashinfer_prepare_finalize(use_dp=False), # could be swapped later + FlashInferCuteDSLExperts( + out_dtype=hidden_states.dtype, + quant_config=quant_config, + ), + ) + + return fused_experts( + hidden_states=hidden_states, + w1=w1, + w2=w2, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=inplace, + activation=activation, + global_num_experts=global_num_experts, + expert_map=expert_map, + apply_router_weight_on_input=apply_router_weight_on_input, + ) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 38ab7cd4f115..f684c17452a9 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -1468,7 +1468,10 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: gemm1_weight = layer.w13_weight.data gemm1_weight_scale = layer.w13_weight_scale.data - if self.allow_flashinfer: + if ( + self.allow_flashinfer + and self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS + ): gemm1_weight, gemm1_weight_scale = reorder_w1w3_to_w3w1( gemm1_weight, gemm1_weight_scale, dim=-2 ) @@ -1746,17 +1749,26 @@ def apply( workspace=layer.workspace, ) - elif ( - self.allow_flashinfer - and self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS - ): - from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501 - flashinfer_cutlass_moe_fp4, + elif self.allow_flashinfer: + assert self.flashinfer_moe_backend in ( + FlashinferMoeBackend.CUTLASS, + FlashinferMoeBackend.CUTEDSL, ) + if self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS: + from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501 + flashinfer_cutlass_moe_fp4, + ) - assert self.moe_quant_config is not None + flashinfer_fn_moe_fp4 = flashinfer_cutlass_moe_fp4 + else: + from vllm.model_executor.layers.fused_moe.flashinfer_cutedsl_moe import ( # noqa: E501 + flashinfer_cutedsl_moe_fp4, + ) + + flashinfer_fn_moe_fp4 = flashinfer_cutedsl_moe_fp4 - return flashinfer_cutlass_moe_fp4( + assert self.moe_quant_config is not None + return flashinfer_fn_moe_fp4( hidden_states=x, w1=layer.w13_weight, w2=layer.w2_weight, diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py index fdf330329e20..36e8599dd948 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py @@ -10,6 +10,9 @@ FusedMoEConfig, FusedMoEQuantConfig, ) +from vllm.model_executor.layers.fused_moe.flashinfer_cutedsl_moe import ( + FlashInferCuteDSLExperts, +) from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( FlashInferExperts, ) @@ -17,10 +20,14 @@ create_flashinfer_prepare_finalize, ) from vllm.platforms import current_platform -from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe +from vllm.utils.flashinfer import ( + has_flashinfer_cutedsl_grouped_gemm_nt_masked, + has_flashinfer_cutlass_fused_moe, +) __all__ = [ "is_flashinfer_fp4_cutlass_moe_available", + "is_flashinfer_fp4_cutedsl_moe_available", "reorder_w1w3_to_w3w1", "build_flashinfer_fp4_cutlass_moe_prepare_finalize", ] @@ -36,6 +43,16 @@ def is_flashinfer_fp4_cutlass_moe_available() -> bool: ) +def is_flashinfer_fp4_cutedsl_moe_available() -> bool: + """Return ``True`` when FlashInfer CUTEDSL NV-FP4 kernels can be used.""" + 
return ( + envs.VLLM_USE_FLASHINFER_MOE_FP4 + and has_flashinfer_cutedsl_grouped_gemm_nt_masked() + and current_platform.is_cuda() + and current_platform.is_device_capability(100) + ) + + def reorder_w1w3_to_w3w1( weight: torch.Tensor, scale: torch.Tensor, dim: int = -2 ) -> tuple[torch.Tensor, torch.Tensor]: @@ -72,15 +89,21 @@ def select_nvfp4_gemm_impl( """Return a GEMM *experts* implementation for NV-FP4 fused-MoE layers""" if allow_flashinfer: - return FlashInferExperts( - out_dtype=moe.in_dtype, - quant_config=moe_quant_config, - ep_rank=moe.moe_parallel_config.ep_rank, - ep_size=moe.moe_parallel_config.ep_size, - tp_rank=moe.moe_parallel_config.tp_rank, - tp_size=moe.moe_parallel_config.tp_size, - use_dp=moe.moe_parallel_config.dp_size > 1, - ) + if envs.VLLM_FLASHINFER_MOE_BACKEND == "masked_gemm": + return FlashInferCuteDSLExperts( + out_dtype=moe.in_dtype, + quant_config=moe_quant_config, + ) + elif envs.VLLM_FLASHINFER_MOE_BACKEND == "throughput": + return FlashInferExperts( + out_dtype=moe.in_dtype, + quant_config=moe_quant_config, + ep_rank=moe.moe_parallel_config.ep_rank, + ep_size=moe.moe_parallel_config.ep_size, + tp_rank=moe.moe_parallel_config.tp_rank, + tp_size=moe.moe_parallel_config.tp_size, + use_dp=moe.moe_parallel_config.dp_size > 1, + ) # native cutlass experts currently don't support DP; TP case won't call this raise ValueError( diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py index f22e17945d1f..7eba8359b92f 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py @@ -25,6 +25,7 @@ class FlashinferMoeBackend(Enum): TENSORRT_LLM = "TensorRT-LLM" CUTLASS = "CUTLASS" + CUTEDSL = "CUTEDSL" def calculate_tile_tokens_dim(num_tokens, top_k, num_experts): @@ -273,19 +274,21 @@ def flashinfer_cutlass_moe_fp8( def get_flashinfer_moe_backend() -> FlashinferMoeBackend: + backend_map = { + "throughput": FlashinferMoeBackend.CUTLASS, + "latency": FlashinferMoeBackend.TENSORRT_LLM, + "masked_gemm": FlashinferMoeBackend.CUTEDSL, + } + flashinfer_moe_backend = envs.VLLM_FLASHINFER_MOE_BACKEND - # Prefer CUTLASS on SM90 to cover both SM90/SM100 generations - if flashinfer_moe_backend == "throughput" or current_platform.is_device_capability( - 90 - ): + if flashinfer_moe_backend in backend_map: + return backend_map[flashinfer_moe_backend] + elif current_platform.is_device_capability(90): return FlashinferMoeBackend.CUTLASS - elif flashinfer_moe_backend == "latency": - return FlashinferMoeBackend.TENSORRT_LLM - allowed_backends = ["throughput", "latency"] raise ValueError( - f"Unknown flashinfer moe backend: {flashinfer_moe_backend}" - f" expected one of {allowed_backends}" + f"Unknown flashinfer moe backend: {flashinfer_moe_backend!r}. " + f"Expected one of {list(backend_map.keys())}." 
     )
diff --git a/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py b/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py
index c3f26cc77411..44c5b027daf4 100644
--- a/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py
+++ b/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py
@@ -5,6 +5,7 @@
 import vllm.envs as envs
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import (
+    is_flashinfer_fp4_cutedsl_moe_available,
     is_flashinfer_fp4_cutlass_moe_available,
 )
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
@@ -32,7 +33,10 @@ def detect_nvfp4_moe_support(class_name: str = "") -> NvFp4Support:
     """Detect platform support for NV-FP4 fused-MoE path"""
     cutlass_supported = cutlass_fp4_supported()
 
-    allow_flashinfer = cutlass_supported and is_flashinfer_fp4_cutlass_moe_available()
+    allow_flashinfer = cutlass_supported and (
+        is_flashinfer_fp4_cutlass_moe_available()
+        or is_flashinfer_fp4_cutedsl_moe_available()
+    )
 
     if allow_flashinfer:
         _logger.info_once(
diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py
index 1209d64901bf..9f9976d52b4a 100644
--- a/vllm/utils/flashinfer.py
+++ b/vllm/utils/flashinfer.py
@@ -114,7 +114,17 @@ def wrapper(*args, **kwargs):
 flashinfer_cutlass_fused_moe = _lazy_import_wrapper(
     "flashinfer.fused_moe", "cutlass_fused_moe"
 )
+flashinfer_cutedsl_grouped_gemm_nt_masked = _lazy_import_wrapper(
+    "flashinfer.cute_dsl.blockscaled_gemm", "grouped_gemm_nt_masked"
+)
 flashinfer_fp4_quantize = _lazy_import_wrapper("flashinfer", "fp4_quantize")
+nvfp4_batched_quantize = _lazy_import_wrapper("flashinfer", "nvfp4_batched_quantize")
+silu_and_mul_scaled_nvfp4_experts_quantize = _lazy_import_wrapper(
+    "flashinfer", "silu_and_mul_scaled_nvfp4_experts_quantize"
+)
+scaled_fp4_grouped_quantize = _lazy_import_wrapper(
+    "flashinfer", "scaled_fp4_grouped_quantize"
+)
 nvfp4_block_scale_interleave = _lazy_import_wrapper(
     "flashinfer", "nvfp4_block_scale_interleave"
 )
@@ -166,6 +176,14 @@ def has_flashinfer_moe() -> bool:
     )
 
 
+@functools.cache
+def has_flashinfer_cutedsl() -> bool:
+    """Return ``True`` if FlashInfer cutedsl module is available."""
+    return (
+        has_flashinfer() and importlib.util.find_spec("flashinfer.cute_dsl") is not None
+    )
+
+
 @functools.cache
 def has_flashinfer_cutlass_fused_moe() -> bool:
     """Return `True` if FlashInfer CUTLASS fused MoE is available."""
@@ -187,6 +205,26 @@ def has_flashinfer_cutlass_fused_moe() -> bool:
     return True
 
 
+@functools.cache
+def has_flashinfer_cutedsl_grouped_gemm_nt_masked() -> bool:
+    """Return ``True`` if FlashInfer CuteDSL masked grouped GEMM is available."""
+    if not has_flashinfer_cutedsl():
+        return False
+
+    # Check if all required functions are available
+    required_functions = [
+        ("flashinfer.cute_dsl.blockscaled_gemm", "grouped_gemm_nt_masked"),
+        ("flashinfer", "scaled_fp4_grouped_quantize"),
+        ("flashinfer", "silu_and_scaled_nvfp4_experts_quantize"),
+    ]
+
+    for module_name, attr_name in required_functions:
+        mod = _get_submodule(module_name)
+        if not mod or not hasattr(mod, attr_name):
+            return False
+    return True
+
+
 @functools.cache
 def has_nvidia_artifactory() -> bool:
     """Return `True` if NVIDIA's artifactory is accessible. 
@@ -472,7 +510,10 @@ def flashinfer_disable_q_quantization() -> bool: "has_flashinfer", "flashinfer_trtllm_fp8_block_scale_moe", "flashinfer_cutlass_fused_moe", + "flashinfer_cutedsl_grouped_gemm_nt_masked", "flashinfer_fp4_quantize", + "silu_and_mul_scaled_nvfp4_experts_quantize", + "scaled_fp4_grouped_quantize", "nvfp4_block_scale_interleave", "trtllm_fp4_block_scale_moe", "autotune", @@ -480,6 +521,7 @@ def flashinfer_disable_q_quantization() -> bool: "has_flashinfer_comm", "has_flashinfer_all2all", "has_flashinfer_cutlass_fused_moe", + "has_flashinfer_cutedsl_grouped_gemm_nt_masked", "has_nvidia_artifactory", "supports_trtllm_attention", "can_use_trtllm_attention", From 88f5b19f0bc681c016eaaa17502d3bb4e2b59b51 Mon Sep 17 00:00:00 2001 From: Yongye Zhu Date: Wed, 19 Nov 2025 16:30:04 -0500 Subject: [PATCH 207/578] [DeepSeek] Fix DeepSeek V3.2 Rope Embedding (#28968) Signed-off-by: Yongye Zhu --- vllm/model_executor/layers/mla.py | 6 +++++- vllm/model_executor/models/deepseek_v2.py | 14 ++++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/mla.py b/vllm/model_executor/layers/mla.py index c4c44b83ae6b..6ebfa47a9dc3 100644 --- a/vllm/model_executor/layers/mla.py +++ b/vllm/model_executor/layers/mla.py @@ -24,6 +24,7 @@ class MLAModules: q_b_proj: torch.nn.Module | None q_proj: torch.nn.Module | None indexer: torch.nn.Module | None + indexer_rotary_emb: torch.nn.Module | None is_sparse: bool topk_indices_buffer: torch.Tensor | None @@ -80,6 +81,7 @@ def __init__( self.rotary_emb = mla_modules.rotary_emb self.o_proj = mla_modules.o_proj self.indexer = mla_modules.indexer + self.indexer_rope_emb = mla_modules.indexer_rotary_emb self.is_sparse = mla_modules.is_sparse if self.indexer is not None: @@ -153,7 +155,9 @@ def forward_native( ) if self.indexer and self.is_sparse: - _topk_indices = self.indexer(hidden_states, q_c, positions, self.rotary_emb) + _topk_indices = self.indexer( + hidden_states, q_c, positions, self.indexer_rope_emb + ) attn_out = self.mla_attn( q, diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 6675b2133f38..c0ff621d8408 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -837,8 +837,8 @@ def forward( ) q_pe, k_pe = rotary_emb(positions, q_pe, k_pe.unsqueeze(1)) - q = torch.cat([q_pe, q_nope], dim=-1) - k = torch.cat([k_pe.squeeze(1), k_nope], dim=-1) + q = torch.cat([q_pe.squeeze(0), q_nope], dim=-1) + k = torch.cat([k_pe.squeeze((0, 2)), k_nope], dim=-1) # we only quant q here since k quant is fused with cache insertion q = q.view(-1, self.head_dim) @@ -987,6 +987,14 @@ def __init__( self.is_v32 = hasattr(config, "index_topk") if self.is_v32: + self.indexer_rope_emb = get_rope( + qk_rope_head_dim, + rotary_dim=qk_rope_head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + is_neox_style=True, + ) self.indexer = Indexer( vllm_config, config, @@ -998,6 +1006,7 @@ def __init__( f"{prefix}.indexer", ) else: + self.indexer_rope_emb = None self.indexer = None mla_modules = MLAModules( @@ -1015,6 +1024,7 @@ def __init__( q_b_proj=self.q_b_proj if self.q_lora_rank is not None else None, q_proj=self.q_proj if self.q_lora_rank is None else None, indexer=self.indexer, + indexer_rotary_emb=self.indexer_rope_emb, is_sparse=self.is_v32, topk_indices_buffer=topk_indices_buffer, ) From 22e44ad589d951f440ef98141a2a6f9df97f6873 Mon Sep 17 00:00:00 2001 From: Micah Williamson Date: Wed, 
19 Nov 2025 15:31:33 -0600 Subject: [PATCH 208/578] [ROCm][CI] Fix Weight Loading With Multiple GPU Tests on ROCm (#28984) Signed-off-by: Micah Williamson --- .buildkite/test-amd.yaml | 5 ++--- tests/weight_loading/models-amd.txt | 3 +++ tests/weight_loading/models-large-amd.txt | 3 +++ 3 files changed, 8 insertions(+), 3 deletions(-) create mode 100644 tests/weight_loading/models-amd.txt create mode 100644 tests/weight_loading/models-large-amd.txt diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 0049f3540340..37c6bd427672 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1323,7 +1323,7 @@ steps: - vllm/ - tests/weight_loading commands: - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt - label: Weight Loading Multiple GPU Test - Large Models # optional mirror_hardwares: [amdexperimental] @@ -1331,13 +1331,12 @@ steps: # grade: Blocking working_dir: "/vllm-workspace/tests" num_gpus: 2 - gpu: a100 optional: true source_file_dependencies: - vllm/ - tests/weight_loading commands: - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt - label: NixlConnector PD accuracy tests (Distributed) # 30min mirror_hardwares: [amdexperimental] diff --git a/tests/weight_loading/models-amd.txt b/tests/weight_loading/models-amd.txt new file mode 100644 index 000000000000..e31e904c08af --- /dev/null +++ b/tests/weight_loading/models-amd.txt @@ -0,0 +1,3 @@ +fp8, amd/Meta-Llama-3.1-8B-Instruct-FP8-KV, main +None, amd/Llama-3.2-1B-Instruct-FP8-KV, main +fp8, amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV, main diff --git a/tests/weight_loading/models-large-amd.txt b/tests/weight_loading/models-large-amd.txt new file mode 100644 index 000000000000..b6f5b4b16b37 --- /dev/null +++ b/tests/weight_loading/models-large-amd.txt @@ -0,0 +1,3 @@ +fp8, amd/Meta-Llama-3.1-70B-Instruct-FP8-KV, main +None, microsoft/phi-4, main +fp8, amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV, main From 8f4f77a7275ecac594f84bdb41b67c95cf3eb26d Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Wed, 19 Nov 2025 16:43:54 -0500 Subject: [PATCH 209/578] [BugFix] Fix false assertion with spec-decode=[2,4,..] 
and TP>2 (#29036) Signed-off-by: Lucas Wilkinson --- vllm/config/compilation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index ca01cb3fb55d..1c3ef502f0f4 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -921,7 +921,7 @@ def adjust_cudagraph_sizes_for_spec_decode( self, uniform_decode_query_len: int, tensor_parallel_size: int ): multiple_of = uniform_decode_query_len - if tensor_parallel_size > 1: + if tensor_parallel_size > 1 and self.pass_config.enable_sequence_parallelism: multiple_of = max(uniform_decode_query_len, tensor_parallel_size) if ( multiple_of % uniform_decode_query_len != 0 From cb0a7b4bea26657da989562a10055b7d0b59fd3a Mon Sep 17 00:00:00 2001 From: Max Hu Date: Wed, 19 Nov 2025 16:54:15 -0500 Subject: [PATCH 210/578] [Bugfix] Move flashinfer kernel check into ```__init__``` function of ```FusedMoE``` (#29018) Signed-off-by: Max Hu --- vllm/model_executor/layers/fused_moe/layer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 7b15e63e9e35..be1910266c87 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -574,6 +574,9 @@ def __init__( is_act_and_mul=is_act_and_mul, is_lora_enabled=vllm_config.lora_config is not None, ) + self.moe_config_use_flashinfer_cutlass_kernels = ( + self.moe_config.use_flashinfer_cutlass_kernels + ) self.quant_config = quant_config @@ -728,7 +731,7 @@ def use_flashinfer_cutlass_kernels(self): return ( self.moe_quant_config is not None and self.moe_quant_config.quant_dtype == "nvfp4" - and self.moe_config.use_flashinfer_cutlass_kernels + and self.moe_config_use_flashinfer_cutlass_kernels ) @property From 0075bfffd4201d1377f0d048848f82911e917639 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Wed, 19 Nov 2025 17:22:43 -0500 Subject: [PATCH 211/578] [CI] Fix precommit `rope_theta` issue (#29040) Signed-off-by: yewentao256 --- vllm/model_executor/models/deepseek_v2.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index c0ff621d8408..c50fc327e760 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -991,8 +991,7 @@ def __init__( qk_rope_head_dim, rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.indexer = Indexer( From 8e38e998298364b0a94cddf7ccc59d8466c2396a Mon Sep 17 00:00:00 2001 From: JartX Date: Thu, 20 Nov 2025 00:30:08 +0100 Subject: [PATCH 212/578] [Feature] EPLB on Qwen3VLMoe and CompressedTensorsWNA16MoEMethod (#28849) --- .../compressed_tensors_moe.py | 27 +++++++- vllm/model_executor/models/qwen3_vl_moe.py | 62 +++++++++++++++++-- 2 files changed, 82 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 22b3c477f420..fa254030a271 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -1921,9 +1921,20 @@ def apply( 
         logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if enable_eplb:
-            raise NotImplementedError(
-                "EPLB not supported for `CompressedTensorsWNA16MoEMethod` yet."
-            )
+            if expert_load_view is None:
+                raise ValueError("enable_eplb=True requires expert_load_view != None")
+            if logical_to_physical_map is None:
+                raise ValueError(
+                    "enable_eplb=True requires logical_to_physical_map != None"
+                )
+            if logical_replica_count is None:
+                raise ValueError(
+                    "enable_eplb=True requires logical_replica_count != None"
+                )
+            if not isinstance(layer, FusedMoE):
+                raise TypeError(
+                    "EPLB is only supported when `layer` is an instance of FusedMoE."
+                )
 
         from vllm.model_executor.layers.fused_moe import fused_experts
 
@@ -1940,6 +1951,12 @@ def apply(
             routed_scaling_factor=routed_scaling_factor,
             e_score_correction_bias=e_score_correction_bias,
             indices_type=self.topk_indices_dtype,
+            num_fused_shared_experts=getattr(layer, "num_fused_shared_experts", 0),
+            enable_eplb=enable_eplb,
+            expert_map=expert_map,
+            expert_load_view=expert_load_view,
+            logical_to_physical_map=logical_to_physical_map,
+            logical_replica_count=logical_replica_count,
         )
 
         return fused_experts(
@@ -1956,6 +1973,10 @@ def apply(
             quant_config=self.moe_quant_config,
         )
 
+    @property
+    def supports_eplb(self) -> bool:
+        return True
+
 
 class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod):
     """
diff --git a/vllm/model_executor/models/qwen3_vl_moe.py b/vllm/model_executor/models/qwen3_vl_moe.py
index 5c3205faf9c2..e2c129120b1a 100644
--- a/vllm/model_executor/models/qwen3_vl_moe.py
+++ b/vllm/model_executor/models/qwen3_vl_moe.py
@@ -15,7 +15,7 @@
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
@@ -29,7 +29,9 @@
 from itertools import islice
 
 import torch
-from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import Qwen3VLMoeConfig
+from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import (
+    Qwen3VLMoeConfig,
+)
 
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import VllmConfig
@@ -44,7 +46,12 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.sequence import IntermediateTensors
 
-from .qwen3_moe import Qwen3MoeForCausalLM, Qwen3MoeModel
+from .interfaces import MixtureOfExperts
+from .qwen3_moe import (
+    Qwen3MoeForCausalLM,
+    Qwen3MoeModel,
+    Qwen3MoeSparseMoeBlock,
+)
 from .qwen3_vl import (
     Qwen3_VisionTransformer,
     Qwen3VLDummyInputsBuilder,
@@ -344,12 +351,56 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         )
 
 
+class Qwen3VLMoeMixtureOfExperts(MixtureOfExperts):
+    def update_physical_experts_metadata(
+        self,
+        num_physical_experts: int,
+        num_local_physical_experts: int,
+    ) -> None:
+        assert self.num_local_physical_experts == num_local_physical_experts
+        self.num_physical_experts = num_physical_experts
+        self.num_local_physical_experts = num_local_physical_experts
+        self.num_redundant_experts = num_physical_experts - self.num_logical_experts
+        for layer in self.language_model.model.layers:
+            if isinstance(layer.mlp, Qwen3MoeSparseMoeBlock):
+                moe = layer.mlp
+                moe.n_local_physical_experts = num_local_physical_experts
+                moe.n_physical_experts = num_physical_experts
+                moe.n_redundant_experts = 
self.num_redundant_experts + moe.experts.update_expert_map() + + def set_moe_parameters(self): + self.expert_weights = [] + + self.moe_layers = [] + example_moe = None + for layer in self.language_model.model.layers: + if hasattr(layer, "mlp") and isinstance(layer.mlp, Qwen3MoeSparseMoeBlock): + example_moe = layer.mlp + self.moe_layers.append(layer.mlp.experts) + + if example_moe is None: + raise RuntimeError("No Qwen3Moe layer found in the language_model.") + + # Set MoE hyperparameters + self.num_moe_layers = len(self.moe_layers) + self.num_expert_groups = 1 + self.num_shared_experts = 0 + self.num_logical_experts = example_moe.n_logical_experts + self.num_physical_experts = example_moe.n_physical_experts + self.num_local_physical_experts = example_moe.n_local_physical_experts + self.num_routed_experts = example_moe.n_routed_experts + self.num_redundant_experts = example_moe.n_redundant_experts + + @MULTIMODAL_REGISTRY.register_processor( Qwen3VLMultiModalProcessor, info=Qwen3VLMoeProcessingInfo, dummy_inputs=Qwen3VLDummyInputsBuilder, ) -class Qwen3VLMoeForConditionalGeneration(Qwen3VLForConditionalGeneration): +class Qwen3VLMoeForConditionalGeneration( + Qwen3VLForConditionalGeneration, Qwen3VLMoeMixtureOfExperts +): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -413,3 +464,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.deepstack_input_embeds = None self.visual_dim = config.vision_config.out_hidden_size self.multiscale_dim = self.visual_dim * self.deepstack_num_level + + # Set MoE hyperparameters + self.set_moe_parameters() From 3aaa94ac99f4b295ba95f14b4968620b2127044f Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-redhat@users.noreply.github.com> Date: Wed, 19 Nov 2025 18:47:13 -0500 Subject: [PATCH 213/578] [Performance] Reduce DeepGEMM N dim restriction from 128 to 64 multiplier (#28687) Signed-off-by: Alexander Matveev Signed-off-by: mgoin Co-authored-by: mgoin --- .buildkite/test-pipeline.yaml | 20 ++++++++++++++++++++ tests/kernels/quantization/test_block_fp8.py | 11 +++++++---- vllm/utils/deep_gemm.py | 11 +++++++++-- 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 5309581d8e81..71249a9543c7 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -550,6 +550,26 @@ steps: commands: - pytest -v -s kernels/mamba +- label: Kernels DeepGEMM Test (H100) + timeout_in_minutes: 45 + gpu: h100 + num_gpus: 1 + optional: true + source_file_dependencies: + - tools/install_deepgemm.sh + - vllm/utils/deep_gemm.py + - vllm/model_executor/layers/fused_moe + - vllm/model_executor/layers/quantization + - tests/kernels/quantization/test_block_fp8.py + - tests/kernels/moe/test_deepgemm.py + - tests/kernels/moe/test_batched_deepgemm.py + - tests/kernels/attention/test_deepgemm_attention.py + commands: + - pytest -v -s tests/kernels/quantization/test_block_fp8.py -k deep_gemm + - pytest -v -s tests/kernels/moe/test_deepgemm.py + - pytest -v -s tests/kernels/moe/test_batched_deepgemm.py + - pytest -v -s tests/kernels/attention/test_deepgemm_attention.py + - label: Model Executor Test # 23min timeout_in_minutes: 35 torch_nightly: true diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py index e9973c1fcc15..d0e4f6554a91 100644 --- a/tests/kernels/quantization/test_block_fp8.py +++ b/tests/kernels/quantization/test_block_fp8.py @@ -22,6 +22,7 @@ fp8_gemm_nt, get_col_major_tma_aligned_tensor, 
per_block_cast_to_fp8, + should_use_deepgemm_for_fp8_linear, ) from vllm.utils.import_utils import has_deep_gemm @@ -157,10 +158,6 @@ def test_w8a8_block_fp8_cutlass_matmul(): @pytest.mark.skipif(not has_deep_gemm(), reason="DeepGemm kernels not available.") @torch.inference_mode() def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed): - # only aligned sizes - if M % 4 != 0 or K % 128 != 0 or N % 64 != 0: - pytest.skip(f"Skipping test; invalid size {M}, {N}, {K}") - torch.manual_seed(seed) fp8_info = torch.finfo(torch.float8_e4m3fn) fp8_max = fp8_info.max @@ -168,6 +165,12 @@ def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed): A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max + # only aligned sizes are supported by deepgemm + if not should_use_deepgemm_for_fp8_linear( + output_dtype=out_dtype, weight=B_fp32, supports_deep_gemm=True + ): + pytest.skip(f"Skipping test; invalid size {M}, {N}, {K}") + A_fp8, As_fp8 = per_token_group_quant_fp8(A_fp32, block_size[1]) B_fp8, Bs_fp8 = per_block_cast_to_fp8(B_fp32, block_size=block_size) diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index b5ab37534dd7..6b0a383a0e28 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -365,11 +365,18 @@ def should_use_deepgemm_for_fp8_linear( ): if supports_deep_gemm is None: supports_deep_gemm = is_deep_gemm_supported() + + # Verify DeepGEMM N/K dims requirements + # NOTE: Also synchronized with test_w8a8_block_fp8_deep_gemm_matmul + # test inside kernels/quatization/test_block_fp8.py + N_MULTIPLE = 64 + K_MULTIPLE = 128 + return ( supports_deep_gemm and output_dtype == torch.bfloat16 - and weight.shape[0] % 128 == 0 - and weight.shape[1] % 128 == 0 + and weight.shape[0] % N_MULTIPLE == 0 + and weight.shape[1] % K_MULTIPLE == 0 ) From 5031cd5d55ad99e8f9b31dd0020a06b346f6e493 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Wed, 19 Nov 2025 18:53:15 -0500 Subject: [PATCH 214/578] [Refactor] Optimize `select_experts` (#28069) Signed-off-by: yewentao256 --- vllm/model_executor/layers/fused_moe/fused_moe.py | 5 ----- vllm/model_executor/layers/fused_moe/layer.py | 11 ++++------- vllm/model_executor/layers/quantization/modelopt.py | 2 +- vllm/model_executor/models/longcat_flash.py | 2 +- vllm/model_executor/models/openpangu.py | 2 +- 5 files changed, 7 insertions(+), 15 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 2e042d85fcfc..f44328418f1b 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1246,7 +1246,6 @@ def eplb_map_to_physical_and_record( expert_load_view: torch.Tensor, logical_to_physical_map: torch.Tensor, logical_replica_count: torch.Tensor, - indices_type: torch.dtype | None = None, ) -> torch.Tensor: """ Map the logical expert ids to physical expert ids @@ -1260,7 +1259,6 @@ def eplb_map_to_physical_and_record( expert_load_view: The expert load view. logical_to_physical_map: The logical to physical map. logical_replica_count: The logical replica count. - indices_type: The indices type. Returns: The physical expert ids. 
@@ -1310,9 +1308,6 @@ def eplb_map_to_physical_and_record( index=topk_ids_flatten.long(), src=torch.ones_like(topk_ids_flatten).to(expert_load_view), ) - - if indices_type is not None: - topk_ids = topk_ids.to(dtype=indices_type) return topk_ids diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index be1910266c87..d9525a7439c3 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -68,7 +68,6 @@ def _eplb_map_to_physical_and_record( expert_load_view: torch.Tensor, logical_to_physical_map: torch.Tensor, logical_replica_count: torch.Tensor, - indices_type: torch.dtype | None, ) -> torch.Tensor: # CPU fallback: no EPLB so just return as is return topk_ids @@ -1509,8 +1508,6 @@ def select_experts( routed_scaling_factor=routed_scaling_factor, e_score_correction_bias=e_score_correction_bias, ) - if indices_type is not None: - topk_ids = topk_ids.to(dtype=indices_type) elif e_score_correction_bias is not None: topk_weights, topk_ids = fused_topk_bias( hidden_states=hidden_states, @@ -1519,7 +1516,7 @@ def select_experts( topk=top_k, renormalize=renormalize, ) - if routed_scaling_factor is not None: + if routed_scaling_factor != 1.0: topk_weights *= routed_scaling_factor elif custom_routing_function is None: topk_weights, topk_ids, token_expert_indices = fused_topk( @@ -1536,8 +1533,6 @@ def select_experts( topk=top_k, renormalize=renormalize, ) - if indices_type is not None: - topk_ids = topk_ids.to(dtype=indices_type) if enable_eplb: assert expert_load_view is not None @@ -1549,9 +1544,11 @@ def select_experts( expert_load_view=expert_load_view, logical_to_physical_map=logical_to_physical_map, logical_replica_count=logical_replica_count, - indices_type=indices_type, ) + if (indices_type is not None) and topk_ids.dtype != indices_type: + topk_ids = topk_ids.to(dtype=indices_type) + assert topk_ids.dtype == indices_type or indices_type is None # Compute zero expert result if needed diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index f684c17452a9..dedab33c1bdb 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -1706,7 +1706,7 @@ def apply( intermediate_size=layer.intermediate_size_per_partition, local_expert_offset=layer.ep_rank * layer.local_num_experts, local_num_experts=layer.local_num_experts, - routed_scaling_factor=None, + routed_scaling_factor=1.0, tile_tokens_dim=None, routing_method_type=routing_method_type, do_finalize=True, diff --git a/vllm/model_executor/models/longcat_flash.py b/vllm/model_executor/models/longcat_flash.py index fafe97cd2be7..c5441283f971 100644 --- a/vllm/model_executor/models/longcat_flash.py +++ b/vllm/model_executor/models/longcat_flash.py @@ -118,7 +118,7 @@ def __init__( router_dtype="float32", router_bias=False, topk_method=None, - routed_scaling_factor=None, + routed_scaling_factor=1.0, zero_expert_num=0, zero_expert_type=None, nextn_use_scmoe=False, diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py index f814cdfec5a2..4124a181a14c 100644 --- a/vllm/model_executor/models/openpangu.py +++ b/vllm/model_executor/models/openpangu.py @@ -625,7 +625,7 @@ def __init__( bias=getattr(config, "mlp_bias", False), prefix=f"{prefix}.mlp", ) - self.routed_scaling_factor = getattr(config, "routed_scaling_factor", None) + self.routed_scaling_factor = getattr(config, 
"routed_scaling_factor", 1.0) self.num_hidden_layers = config.num_hidden_layers self.first_k_dense_replace = getattr( config, "first_k_dense_replace", self.num_hidden_layers From 537cc635c77ac63f643c5289137debdd8f9591ac Mon Sep 17 00:00:00 2001 From: Jialin Ouyang Date: Wed, 19 Nov 2025 16:10:22 -0800 Subject: [PATCH 215/578] [GC Debugger] Simply and improve GC Debugger Utils (#29029) Signed-off-by: Jialin Ouyang --- vllm/utils/gc_utils.py | 7 ++++--- vllm/v1/engine/core.py | 5 ++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/utils/gc_utils.py b/vllm/utils/gc_utils.py index 160ac9ac263a..3436e450a269 100644 --- a/vllm/utils/gc_utils.py +++ b/vllm/utils/gc_utils.py @@ -68,9 +68,10 @@ def handle(self, phase: str, info: dict[str, int]) -> None: # Before GC started, record GC start time # and top collected objects self.start_time_ns = time.monotonic_ns() - self.gc_top_collected_objects = _compute_top_gc_collected_objects( - gc.get_objects(generation), self.config.top_objects - ) + if (top_objects := self.config.top_objects) > 0: + self.gc_top_collected_objects = _compute_top_gc_collected_objects( + gc.get_objects(generation), top_objects + ) elif phase == "stop": # After GC finished, Record GC elapsed time and # optionally top collected objects diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 6be19894d332..8657a95b5e6e 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -206,6 +206,8 @@ def __init__( # Mark the startup heap as static so that it's ignored by GC. # Reduces pause times of oldest generation collections. freeze_gc_heap() + # If enable, attach GC debugger after static variable freeze. + maybe_attach_gc_debug_callback() def _initialize_kv_caches( self, vllm_config: VllmConfig @@ -645,9 +647,6 @@ def __init__( assert addresses.coordinator_input is not None logger.info("Waiting for READY message from DP Coordinator...") - # If enable, attach GC debugger after static variable freeze. - maybe_attach_gc_debug_callback() - # Enable environment variable cache (e.g. 
assume no more # environment variable overrides after this point) enable_envs_cache() From 9ccef8e333ccd988a587990740405503e76c8c20 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 19 Nov 2025 16:26:04 -0800 Subject: [PATCH 216/578] [Misc] Colorize logs (#29017) Signed-off-by: Nick Hill --- tests/test_logger.py | 94 ++++++++++++++++++--------------- vllm/envs.py | 9 ++++ vllm/logger.py | 51 ++++++++++++------ vllm/logging_utils/__init__.py | 3 +- vllm/logging_utils/formatter.py | 50 ++++++++++++++++++ vllm/utils/system_utils.py | 7 ++- 6 files changed, 152 insertions(+), 62 deletions(-) diff --git a/tests/test_logger.py b/tests/test_logger.py index 01672358902f..8900e9c2a1e6 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -49,10 +49,13 @@ def test_trace_function_call(): os.remove(path) -def test_default_vllm_root_logger_configuration(): +def test_default_vllm_root_logger_configuration(monkeypatch): """This test presumes that VLLM_CONFIGURE_LOGGING (default: True) and VLLM_LOGGING_CONFIG_PATH (default: None) are not configured and default behavior is activated.""" + monkeypatch.setenv("VLLM_LOGGING_COLOR", "0") + _configure_vllm_root_logger() + logger = logging.getLogger("vllm") assert logger.level == logging.DEBUG assert not logger.propagate @@ -70,12 +73,13 @@ def test_default_vllm_root_logger_configuration(): assert formatter.datefmt == _DATE_FORMAT -@patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 1) -@patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH", None) -def test_descendent_loggers_depend_on_and_propagate_logs_to_root_logger(): +def test_descendent_loggers_depend_on_and_propagate_logs_to_root_logger(monkeypatch): """This test presumes that VLLM_CONFIGURE_LOGGING (default: True) and VLLM_LOGGING_CONFIG_PATH (default: None) are not configured and default behavior is activated.""" + monkeypatch.setenv("VLLM_CONFIGURE_LOGGING", "1") + monkeypatch.delenv("VLLM_LOGGING_CONFIG_PATH", raising=False) + root_logger = logging.getLogger("vllm") root_handler = root_logger.handlers[0] @@ -99,49 +103,50 @@ def test_descendent_loggers_depend_on_and_propagate_logs_to_root_logger(): assert log_record.levelno == logging.INFO -@patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 0) -@patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH", None) -def test_logger_configuring_can_be_disabled(): +def test_logger_configuring_can_be_disabled(monkeypatch): """This test calls _configure_vllm_root_logger again to test custom logging config behavior, however mocks are used to ensure no changes in behavior or configuration occur.""" + monkeypatch.setenv("VLLM_CONFIGURE_LOGGING", "0") + monkeypatch.delenv("VLLM_LOGGING_CONFIG_PATH", raising=False) with patch("vllm.logger.dictConfig") as dict_config_mock: _configure_vllm_root_logger() dict_config_mock.assert_not_called() -@patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 1) -@patch( - "vllm.logger.VLLM_LOGGING_CONFIG_PATH", - "/if/there/is/a/file/here/then/you/did/this/to/yourself.json", -) -def test_an_error_is_raised_when_custom_logging_config_file_does_not_exist(): +def test_an_error_is_raised_when_custom_logging_config_file_does_not_exist(monkeypatch): """This test calls _configure_vllm_root_logger again to test custom logging config behavior, however it fails before any change in behavior or configuration occurs.""" + monkeypatch.setenv("VLLM_CONFIGURE_LOGGING", "1") + monkeypatch.setenv( + "VLLM_LOGGING_CONFIG_PATH", + "/if/there/is/a/file/here/then/you/did/this/to/yourself.json", + ) + with pytest.raises(RuntimeError) as ex_info: _configure_vllm_root_logger() assert 
ex_info.type == RuntimeError # noqa: E721 assert "File does not exist" in str(ex_info) -@patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 1) -def test_an_error_is_raised_when_custom_logging_config_is_invalid_json(): +def test_an_error_is_raised_when_custom_logging_config_is_invalid_json(monkeypatch): """This test calls _configure_vllm_root_logger again to test custom logging config behavior, however it fails before any change in behavior or configuration occurs.""" + monkeypatch.setenv("VLLM_CONFIGURE_LOGGING", "1") + with NamedTemporaryFile(encoding="utf-8", mode="w") as logging_config_file: logging_config_file.write("---\nloggers: []\nversion: 1") logging_config_file.flush() - with patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH", logging_config_file.name): - with pytest.raises(JSONDecodeError) as ex_info: - _configure_vllm_root_logger() - assert ex_info.type == JSONDecodeError - assert "Expecting value" in str(ex_info) + monkeypatch.setenv("VLLM_LOGGING_CONFIG_PATH", logging_config_file.name) + with pytest.raises(JSONDecodeError) as ex_info: + _configure_vllm_root_logger() + assert ex_info.type == JSONDecodeError + assert "Expecting value" in str(ex_info) -@patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 1) @pytest.mark.parametrize( "unexpected_config", ( @@ -151,26 +156,30 @@ def test_an_error_is_raised_when_custom_logging_config_is_invalid_json(): ), ) def test_an_error_is_raised_when_custom_logging_config_is_unexpected_json( + monkeypatch, unexpected_config: Any, ): """This test calls _configure_vllm_root_logger again to test custom logging config behavior, however it fails before any change in behavior or configuration occurs.""" + monkeypatch.setenv("VLLM_CONFIGURE_LOGGING", "1") + with NamedTemporaryFile(encoding="utf-8", mode="w") as logging_config_file: logging_config_file.write(json.dumps(unexpected_config)) logging_config_file.flush() - with patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH", logging_config_file.name): - with pytest.raises(ValueError) as ex_info: - _configure_vllm_root_logger() - assert ex_info.type == ValueError # noqa: E721 - assert "Invalid logging config. Expected dict, got" in str(ex_info) + monkeypatch.setenv("VLLM_LOGGING_CONFIG_PATH", logging_config_file.name) + with pytest.raises(ValueError) as ex_info: + _configure_vllm_root_logger() + assert ex_info.type == ValueError # noqa: E721 + assert "Invalid logging config. 
Expected dict, got" in str(ex_info) -@patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 1) -def test_custom_logging_config_is_parsed_and_used_when_provided(): +def test_custom_logging_config_is_parsed_and_used_when_provided(monkeypatch): """This test calls _configure_vllm_root_logger again to test custom logging config behavior, however mocks are used to ensure no changes in behavior or configuration occur.""" + monkeypatch.setenv("VLLM_CONFIGURE_LOGGING", "1") + valid_logging_config = { "loggers": { "vllm.test_logger.logger": { @@ -183,19 +192,18 @@ def test_custom_logging_config_is_parsed_and_used_when_provided(): with NamedTemporaryFile(encoding="utf-8", mode="w") as logging_config_file: logging_config_file.write(json.dumps(valid_logging_config)) logging_config_file.flush() - with ( - patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH", logging_config_file.name), - patch("vllm.logger.dictConfig") as dict_config_mock, - ): + monkeypatch.setenv("VLLM_LOGGING_CONFIG_PATH", logging_config_file.name) + with patch("vllm.logger.dictConfig") as dict_config_mock: _configure_vllm_root_logger() dict_config_mock.assert_called_with(valid_logging_config) -@patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 0) -def test_custom_logging_config_causes_an_error_if_configure_logging_is_off(): +def test_custom_logging_config_causes_an_error_if_configure_logging_is_off(monkeypatch): """This test calls _configure_vllm_root_logger again to test custom logging config behavior, however mocks are used to ensure no changes in behavior or configuration occur.""" + monkeypatch.setenv("VLLM_CONFIGURE_LOGGING", "0") + valid_logging_config = { "loggers": { "vllm.test_logger.logger": { @@ -207,15 +215,15 @@ def test_custom_logging_config_causes_an_error_if_configure_logging_is_off(): with NamedTemporaryFile(encoding="utf-8", mode="w") as logging_config_file: logging_config_file.write(json.dumps(valid_logging_config)) logging_config_file.flush() - with patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH", logging_config_file.name): - with pytest.raises(RuntimeError) as ex_info: - _configure_vllm_root_logger() - assert ex_info.type is RuntimeError - expected_message_snippet = ( - "VLLM_CONFIGURE_LOGGING evaluated to false, but " - "VLLM_LOGGING_CONFIG_PATH was given." - ) - assert expected_message_snippet in str(ex_info) + monkeypatch.setenv("VLLM_LOGGING_CONFIG_PATH", logging_config_file.name) + with pytest.raises(RuntimeError) as ex_info: + _configure_vllm_root_logger() + assert ex_info.type is RuntimeError + expected_message_snippet = ( + "VLLM_CONFIGURE_LOGGING evaluated to false, but " + "VLLM_LOGGING_CONFIG_PATH was given." + ) + assert expected_message_snippet in str(ex_info) # Remember! The root logger is assumed to have been configured as # though VLLM_CONFIGURE_LOGGING=1 and VLLM_LOGGING_CONFIG_PATH=None. diff --git a/vllm/envs.py b/vllm/envs.py index 1ff620af5722..614bc94b978b 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -42,6 +42,8 @@ VLLM_LOGGING_PREFIX: str = "" VLLM_LOGGING_STREAM: str = "ext://sys.stdout" VLLM_LOGGING_CONFIG_PATH: str | None = None + VLLM_LOGGING_COLOR: str = "auto" + NO_COLOR: bool = False VLLM_LOG_STATS_INTERVAL: float = 10.0 VLLM_TRACE_FUNCTION: int = 0 VLLM_ATTENTION_BACKEND: str | None = None @@ -616,6 +618,11 @@ def get_vllm_port() -> int | None: "VLLM_LOGGING_STREAM": lambda: os.getenv("VLLM_LOGGING_STREAM", "ext://sys.stdout"), # if set, VLLM_LOGGING_PREFIX will be prepended to all log messages "VLLM_LOGGING_PREFIX": lambda: os.getenv("VLLM_LOGGING_PREFIX", ""), + # Controls colored logging output. 
Options: "auto" (default, colors when terminal), + # "1" (always use colors), "0" (never use colors) + "VLLM_LOGGING_COLOR": lambda: os.getenv("VLLM_LOGGING_COLOR", "auto"), + # Standard unix flag for disabling ANSI color codes + "NO_COLOR": lambda: os.getenv("NO_COLOR", "0") != "0", # If set, vllm will log stats at this interval in seconds # If not set, vllm will log stats every 10 seconds. "VLLM_LOG_STATS_INTERVAL": lambda: val @@ -1578,6 +1585,7 @@ def compile_factors() -> dict[str, object]: "VLLM_LOGGING_PREFIX", "VLLM_LOGGING_STREAM", "VLLM_LOGGING_CONFIG_PATH", + "VLLM_LOGGING_COLOR", "VLLM_LOG_STATS_INTERVAL", "VLLM_DEBUG_LOG_API_SERVER_RESPONSE", "VLLM_TUNED_CONFIG_FOLDER", @@ -1608,6 +1616,7 @@ def compile_factors() -> dict[str, object]: "VLLM_TEST_FORCE_LOAD_FORMAT", "LOCAL_RANK", "CUDA_VISIBLE_DEVICES", + "NO_COLOR", } from vllm.config.utils import normalize_value diff --git a/vllm/logger.py b/vllm/logger.py index 934100829684..772e36497b45 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -17,18 +17,25 @@ import vllm.envs as envs -VLLM_CONFIGURE_LOGGING = envs.VLLM_CONFIGURE_LOGGING -VLLM_LOGGING_CONFIG_PATH = envs.VLLM_LOGGING_CONFIG_PATH -VLLM_LOGGING_LEVEL = envs.VLLM_LOGGING_LEVEL -VLLM_LOGGING_PREFIX = envs.VLLM_LOGGING_PREFIX -VLLM_LOGGING_STREAM = envs.VLLM_LOGGING_STREAM - _FORMAT = ( - f"{VLLM_LOGGING_PREFIX}%(levelname)s %(asctime)s " + f"{envs.VLLM_LOGGING_PREFIX}%(levelname)s %(asctime)s " "[%(fileinfo)s:%(lineno)d] %(message)s" ) _DATE_FORMAT = "%m-%d %H:%M:%S" + +def _use_color() -> bool: + if envs.NO_COLOR or envs.VLLM_LOGGING_COLOR == "0": + return False + if envs.VLLM_LOGGING_COLOR == "1": + return True + if envs.VLLM_LOGGING_STREAM == "ext://sys.stdout": # stdout + return hasattr(sys.stdout, "isatty") and sys.stdout.isatty() + elif envs.VLLM_LOGGING_STREAM == "ext://sys.stderr": # stderr + return hasattr(sys.stderr, "isatty") and sys.stderr.isatty() + return False + + DEFAULT_LOGGING_CONFIG = { "formatters": { "vllm": { @@ -36,13 +43,19 @@ "datefmt": _DATE_FORMAT, "format": _FORMAT, }, + "vllm_color": { + "class": "vllm.logging_utils.ColoredFormatter", + "datefmt": _DATE_FORMAT, + "format": _FORMAT, + }, }, "handlers": { "vllm": { "class": "logging.StreamHandler", - "formatter": "vllm", - "level": VLLM_LOGGING_LEVEL, - "stream": VLLM_LOGGING_STREAM, + # Choose formatter based on color setting. + "formatter": "vllm_color" if _use_color() else "vllm", + "level": envs.VLLM_LOGGING_LEVEL, + "stream": envs.VLLM_LOGGING_STREAM, }, }, "loggers": { @@ -144,7 +157,7 @@ def warning_once( def _configure_vllm_root_logger() -> None: logging_config = dict[str, Any]() - if not VLLM_CONFIGURE_LOGGING and VLLM_LOGGING_CONFIG_PATH: + if not envs.VLLM_CONFIGURE_LOGGING and envs.VLLM_LOGGING_CONFIG_PATH: raise RuntimeError( "VLLM_CONFIGURE_LOGGING evaluated to false, but " "VLLM_LOGGING_CONFIG_PATH was given. VLLM_LOGGING_CONFIG_PATH " @@ -152,16 +165,22 @@ def _configure_vllm_root_logger() -> None: "VLLM_CONFIGURE_LOGGING or unset VLLM_LOGGING_CONFIG_PATH." ) - if VLLM_CONFIGURE_LOGGING: + if envs.VLLM_CONFIGURE_LOGGING: logging_config = DEFAULT_LOGGING_CONFIG - if VLLM_LOGGING_CONFIG_PATH: - if not path.exists(VLLM_LOGGING_CONFIG_PATH): + vllm_handler = logging_config["handlers"]["vllm"] + # Refresh these values in case env vars have changed. 
+ vllm_handler["level"] = envs.VLLM_LOGGING_LEVEL + vllm_handler["stream"] = envs.VLLM_LOGGING_STREAM + vllm_handler["formatter"] = "vllm_color" if _use_color() else "vllm" + + if envs.VLLM_LOGGING_CONFIG_PATH: + if not path.exists(envs.VLLM_LOGGING_CONFIG_PATH): raise RuntimeError( "Could not load logging config. File does not exist: %s", - VLLM_LOGGING_CONFIG_PATH, + envs.VLLM_LOGGING_CONFIG_PATH, ) - with open(VLLM_LOGGING_CONFIG_PATH, encoding="utf-8") as file: + with open(envs.VLLM_LOGGING_CONFIG_PATH, encoding="utf-8") as file: custom_config = json.loads(file.read()) if not isinstance(custom_config, dict): diff --git a/vllm/logging_utils/__init__.py b/vllm/logging_utils/__init__.py index 44b40ead973b..8d3354df215b 100644 --- a/vllm/logging_utils/__init__.py +++ b/vllm/logging_utils/__init__.py @@ -1,12 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from vllm.logging_utils.formatter import NewLineFormatter +from vllm.logging_utils.formatter import ColoredFormatter, NewLineFormatter from vllm.logging_utils.lazy import lazy from vllm.logging_utils.log_time import logtime __all__ = [ "NewLineFormatter", + "ColoredFormatter", "lazy", "logtime", ] diff --git a/vllm/logging_utils/formatter.py b/vllm/logging_utils/formatter.py index 02ba308e1879..3ad4ef8d119a 100644 --- a/vllm/logging_utils/formatter.py +++ b/vllm/logging_utils/formatter.py @@ -75,3 +75,53 @@ def shrink_path(relpath: Path) -> str: parts = msg.split(record.message) msg = msg.replace("\n", "\r\n" + parts[0]) return msg + + +class ColoredFormatter(NewLineFormatter): + """Adds ANSI color codes to log levels for terminal output. + + This formatter adds colors by injecting them into the format string for + static elements (timestamp, filename, line number) and modifying the + levelname attribute for dynamic color selection. 
+ """ + + # ANSI color codes + COLORS = { + "DEBUG": "\033[37m", # White + "INFO": "\033[32m", # Green + "WARNING": "\033[33m", # Yellow + "ERROR": "\033[31m", # Red + "CRITICAL": "\033[35m", # Magenta + } + GREY = "\033[90m" # Grey for timestamp and file info + RESET = "\033[0m" + + def __init__(self, fmt, datefmt=None, style="%"): + # Inject grey color codes into format string for timestamp and file info + if fmt: + # Wrap %(asctime)s with grey + fmt = fmt.replace("%(asctime)s", f"{self.GREY}%(asctime)s{self.RESET}") + # Wrap [%(fileinfo)s:%(lineno)d] with grey + fmt = fmt.replace( + "[%(fileinfo)s:%(lineno)d]", + f"{self.GREY}[%(fileinfo)s:%(lineno)d]{self.RESET}", + ) + + # Call parent __init__ with potentially modified format string + super().__init__(fmt, datefmt, style) + + def format(self, record): + # Store original levelname to restore later (in case record is reused) + orig_levelname = record.levelname + + # Only modify levelname - it needs dynamic color based on severity + if (color_code := self.COLORS.get(record.levelname)) is not None: + record.levelname = f"{color_code}{record.levelname}{self.RESET}" + + # Call parent format which will handle everything else + msg = super().format(record) + + # Restore original levelname + record.levelname = orig_levelname + + return msg diff --git a/vllm/utils/system_utils.py b/vllm/utils/system_utils.py index 5968884e232a..cc872040b6c5 100644 --- a/vllm/utils/system_utils.py +++ b/vllm/utils/system_utils.py @@ -22,7 +22,7 @@ logger = init_logger(__name__) -CYAN = "\033[1;36m" +CYAN = "\033[0;36m" RESET = "\033[0;0m" @@ -142,7 +142,10 @@ def set_process_title( def _add_prefix(file: TextIO, worker_name: str, pid: int) -> None: """Add colored prefix to file output for log decoration.""" - prefix = f"{CYAN}({worker_name} pid={pid}){RESET} " + if envs.NO_COLOR: + prefix = f"({worker_name} pid={pid}) " + else: + prefix = f"{CYAN}({worker_name} pid={pid}){RESET} " file_write = file.write def write_with_prefix(s: str): From 1d642872a27f1c6bedf28669642928cc7eec6532 Mon Sep 17 00:00:00 2001 From: liangel-02 Date: Wed, 19 Nov 2025 19:39:45 -0500 Subject: [PATCH 217/578] [torchao] fix safetensors for sharding (#28169) Signed-off-by: Angel Li --- tests/quantization/test_torchao.py | 9 ++++---- .../model_loader/default_loader.py | 2 +- .../model_loader/weight_utils.py | 23 +++++++++++++++---- 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/tests/quantization/test_torchao.py b/tests/quantization/test_torchao.py index fb8d6130c377..f35c3973ab6e 100644 --- a/tests/quantization/test_torchao.py +++ b/tests/quantization/test_torchao.py @@ -225,13 +225,12 @@ def test_reload_weights(): @pytest.mark.skip( reason="since torchao nightly is only compatible with torch nightly" "currently https://github.com/pytorch/ao/issues/2919, we'll have to skip " - "torchao tests that requires newer versions (0.14.0.dev+) for now" + "torchao tests that requires newer versions (0.15.0.dev+) for now" ) -def test_opt_125m_float8_weight_only_safetensors_model_loading_with_params(vllm_runner): +def test_safetensors_model_loading_with_params(vllm_runner): torch._dynamo.reset() - model_name = ( - "torchao-testing/opt-125m-Float8WeightOnlyConfig-v2-0.14.0.dev-safetensors" - ) + # using this model to test safetensors loading with file sharding + model_name = "torchao-testing/Qwen3-8B-INT4-0.15.0dev-safetensors" with vllm_runner(model_name=model_name, dtype="bfloat16") as llm: output = llm.generate_greedy(["The capital of France is"], max_tokens=4) diff --git 
a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py index b80026741781..67aa584c6bda 100644 --- a/vllm/model_executor/model_loader/default_loader.py +++ b/vllm/model_executor/model_loader/default_loader.py @@ -279,7 +279,7 @@ def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None: if ( hasattr(quant_config, "is_checkpoint_torchao_serialized") and quant_config.is_checkpoint_torchao_serialized - and torchao_version_at_least("0.14.0") + and torchao_version_at_least("0.15.0") ): self.load_config.safetensors_load_strategy = "torchao" diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 89634cbf4124..4572ebe2ea11 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -595,6 +595,9 @@ def safetensors_weights_iterator( if safetensors_load_strategy == "eager": loading_desc += " (eager)" + state_dict = {} + leftover_state_dict: dict[str, torch.Tensor] = {} + for st_file in tqdm( hf_weights_files, desc=loading_desc, @@ -606,9 +609,11 @@ def safetensors_weights_iterator( state_dict = load(f.read()) yield from state_dict.items() elif safetensors_load_strategy == "torchao": - if not torchao_version_at_least("0.14.0"): + # we can't load flattened torchao tensor subclasses directly into the model + # instead we reconstruct the subclasses here before returning + if not torchao_version_at_least("0.15.0"): raise ValueError( - "Please use torchao version >= 0.14.0 \ + "Please use torchao version >= 0.15.0 \ to load torchao safetensors checkpoint" ) from torchao.prototype.safetensors.safetensors_support import ( @@ -616,12 +621,20 @@ def safetensors_weights_iterator( ) with safe_open(st_file, framework="pt") as f: - state_dict = {} for name in f.keys(): # noqa: SIM118 state_dict[name] = f.get_tensor(name) + + # update with leftover tensor data from previous iteration, if any + state_dict.update(leftover_state_dict) metadata = f.metadata() - updated_state_dict = unflatten_tensor_state_dict(state_dict, metadata) - yield from updated_state_dict.items() + # due to sharded checkpoints, we are not guaranteed that we have all + # tensor subclass data on one file + # state_dict has the leftover data from this step and we wait for + # missing information to be provided in a future iteration + unflattened_state_dict, leftover_state_dict = ( + unflatten_tensor_state_dict(state_dict, metadata) + ) + yield from unflattened_state_dict.items() else: with safe_open(st_file, framework="pt") as f: for name in f.keys(): # noqa: SIM118 From 05c2dee7e9f485f1e76eee084849e07c1c12a68b Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 20 Nov 2025 09:40:49 +0800 Subject: [PATCH 218/578] [DeepSeek + LMCache Multiprocess] handle MLA for deepseek model + LMCache Multiprocess connector (#29039) Signed-off-by: KuntaiDu --- .../kv_connector/v1/lmcache_mp_connector.py | 47 +++++++++++++++---- 1 file changed, 39 insertions(+), 8 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py index 55831dc56c80..22ddabbf1e35 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py @@ -7,6 +7,7 @@ import torch import zmq +from lmcache.integration.vllm.utils import mla_enabled from lmcache.utils import init_logger as lmcache_init_logger from 
vllm.config import VllmConfig @@ -60,17 +61,44 @@ def reformat_block_ids(block_ids: tuple[list[int], ...] | None) -> list[int]: return block_ids[0] +def extract_world_size_and_kv_rank( + world_size: int, + rank: int, + vllm_config: VllmConfig, +) -> tuple[int, int]: + """ + Convert the rank for the MLA. + """ + use_mla = mla_enabled(vllm_config.model_config) + if not use_mla: + return world_size, rank + else: + # Tensor parallel does not change the KV caches for MLA models. + # So we need to "exclude" the effect of TP on rank and world size + tp_size = vllm_config.parallel_config.tensor_parallel_size + # vLLM constructs TP groups first, and then construct other + # parallel groups on top of TP groups. + # for example, TP=4, PP=2, + # TP group: [0, 1, 2, 3], [4, 5, 6, 7] + # PP group: [0, 4], [1, 5], [2, 6], [3, 7] + # So we can "exclude" the effect of TP by rank // tp_size. + return world_size // tp_size, rank // tp_size + + def create_scheduler_adapter( server_url: str, zmq_context: zmq.Context, vllm_config: VllmConfig ) -> LMCacheMPSchedulerAdapter: - # TODO: have a helper function to calculate the correct rank and - # world size for the MLA and other models + world_size, kv_rank = extract_world_size_and_kv_rank( + vllm_config.parallel_config.world_size, + vllm_config.parallel_config.rank, + vllm_config, + ) return LMCacheMPSchedulerAdapter( server_url, zmq_context, vllm_config.model_config.model, - vllm_config.parallel_config.world_size, - vllm_config.parallel_config.rank, + world_size, + kv_rank, vllm_config.cache_config.block_size, ) @@ -78,14 +106,17 @@ def create_scheduler_adapter( def create_worker_adapter( server_url: str, zmq_context: zmq.Context, vllm_config: VllmConfig ) -> LMCacheMPWorkerAdapter: - # TODO: have a helper function to calculate the correct rank and - # world size for the MLA and other models + world_size, kv_rank = extract_world_size_and_kv_rank( + vllm_config.parallel_config.world_size, + vllm_config.parallel_config.rank, + vllm_config, + ) return LMCacheMPWorkerAdapter( server_url, zmq_context, vllm_config.model_config.model, - vllm_config.parallel_config.world_size, - vllm_config.parallel_config.rank, + world_size, + kv_rank, vllm_config.cache_config.block_size, ) From 3fb0d90999887949629d1e9bac4d98336a35c475 Mon Sep 17 00:00:00 2001 From: Qiang Zhang Date: Thu, 20 Nov 2025 10:11:52 +0800 Subject: [PATCH 219/578] [AMD] Use Decoupled Kernel Block Size to Support AITER MLA block_size=1 (#27715) Signed-off-by: chiangzhang --- vllm/attention/backends/abstract.py | 14 +++--- .../attention/backends/mla/rocm_aiter_mla.py | 45 +++---------------- 2 files changed, 13 insertions(+), 46 deletions(-) diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index d28bc065852d..188becb6ad6f 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -119,14 +119,12 @@ def supports_block_size(cls, block_size: int | None) -> bool: return True for supported_size in cls.supported_kernel_block_sizes: - is_multiple_of = ( - isinstance(supported_size, MultipleOf) - and block_size % supported_size.base == 0 - ) - is_int_equal = ( - isinstance(supported_size, int) and block_size == supported_size - ) - if is_multiple_of or is_int_equal: + if isinstance(supported_size, MultipleOf): + supported_size = supported_size.base + # With hybrid_blocks feature, the framework-level block size + # only needs to be a multiple of the kernel's requirement, + # even if the kernel requires a fixed block_size. 
+ if block_size % supported_size == 0: return True return False diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py index e1864526f02c..6ccc1a341d56 100644 --- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py +++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py @@ -7,9 +7,8 @@ import torch from vllm._aiter_ops import rocm_aiter_ops -from vllm.attention.backends.abstract import AttentionLayer +from vllm.attention.backends.abstract import AttentionLayer, MultipleOf from vllm.config import VllmConfig -from vllm.utils.math_utils import cdiv from vllm.v1.attention.backends.mla.common import ( MLACommonBackend, MLACommonDecodeMetadata, @@ -22,6 +21,8 @@ class AiterMLABackend(MLACommonBackend): + supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [1] + @staticmethod def get_name() -> str: return "ROCM_AITER_MLA" @@ -71,9 +72,8 @@ def __init__( ) self.compilation_config = vllm_config.compilation_config - max_num_pages_per_req = cdiv( - vllm_config.model_config.max_model_len, self.kv_cache_spec.block_size - ) + # kernel block size is always 1. + max_num_pages_per_req = vllm_config.model_config.max_model_len max_num_reqs = vllm_config.scheduler_config.max_num_seqs max_num_pages = max_num_reqs * max_num_pages_per_req @@ -82,11 +82,6 @@ def __init__( # so we can only use the persistent buffer if a cudagraph is actually # being used. if self.compilation_config.cudagraph_mode.has_full_cudagraphs(): - self.block_table_remapping = torch.zeros( - [max_num_reqs, max_num_pages_per_req * self.kv_cache_spec.block_size], - dtype=torch.int32, - device=device, - ) self.paged_kv_indptr = torch.zeros( max_num_reqs + 1, dtype=torch.int32, device=device ) @@ -111,36 +106,16 @@ def _build_decode( num_decode_tokens: int, dcp_tot_seq_lens_device: torch.Tensor | None, ) -> AiterMLADecodeMetadata: - page_size = self.kv_cache_spec.block_size + # kernel block size is always 1, although the kv block size is not 1. 
device = self.device num_reqs = seq_lens_device.size(0) - bs, _ = block_table_tensor.shape - block_table_tensor = ( - block_table_tensor.unsqueeze(-1).expand(-1, -1, page_size) * page_size - ) - block_table_tensor = ( - block_table_tensor - + torch.arange( - 0, - page_size, - device=block_table_tensor.device, - dtype=block_table_tensor.dtype, - )[None, None, :] - ) - block_table_tensor = block_table_tensor.view(bs, -1) - # after remapping, we assume the block size already equals to 1 - - max_blk_size_per_req = block_table_tensor.shape[-1] mask = torch.arange( block_table_tensor.size(1), dtype=block_table_tensor.dtype, device=device ).unsqueeze(0) < seq_lens_device.unsqueeze(1) paged_kv_indices = block_table_tensor[mask] - paged_kv_last_page_len = seq_lens_device % page_size - paged_kv_last_page_len = torch.where( - paged_kv_last_page_len == 0, page_size, paged_kv_last_page_len - ) + paged_kv_last_page_len = torch.where(seq_lens_device == 0, 1, seq_lens_device) paged_kv_indptr = torch.cat( [ @@ -151,12 +126,6 @@ def _build_decode( if self.compilation_config.cudagraph_mode.has_full_cudagraphs(): num_actual_pages = paged_kv_indices.size(0) - self.block_table_remapping[:num_reqs, :max_blk_size_per_req].copy_( - block_table_tensor, non_blocking=True - ) - block_table_tensor = self.block_table_remapping[ - :num_reqs, :max_blk_size_per_req - ] self.paged_kv_indices[:num_actual_pages].copy_( paged_kv_indices, non_blocking=True From 3168285fcaaee09bc93dce7bc9ae6ee823c71652 Mon Sep 17 00:00:00 2001 From: Fadi Arafeh <115173828+fadara01@users.noreply.github.com> Date: Thu, 20 Nov 2025 02:37:09 +0000 Subject: [PATCH 220/578] [cpu][ci] Add initial set of tests for Arm CPUs (#28657) Signed-off-by: Fadi Arafeh --- .../scripts/hardware_ci/run-cpu-test-arm.sh | 64 +++++++++++++++++++ docker/Dockerfile.cpu | 10 +++ 2 files changed, 74 insertions(+) create mode 100755 .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh new file mode 100755 index 000000000000..d0036f24c8d0 --- /dev/null +++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +# This script build the CPU docker image and run the offline inference inside the container. +# It serves a sanity check for compilation and basic model usage. +set -ex + +# allow to bind to different cores +CORE_RANGE=${CORE_RANGE:-0-16} +OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-16} +NUMA_NODE=${NUMA_NODE:-0} + +export CMAKE_BUILD_PARALLEL_LEVEL=32 + +# Setup cleanup +remove_docker_container() { + set -e; + docker rm -f cpu-test-"$NUMA_NODE" || true; +} +trap remove_docker_container EXIT +remove_docker_container + +# Try building the docker image +numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu . + +# Run the image, setting --shm-size=4g for tensor parallel. 
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" + +function cpu_tests() { + set -e + export NUMA_NODE=$2 + + docker exec cpu-test-"$NUMA_NODE" bash -c " + set -e + pip list" + + # offline inference + docker exec cpu-test-"$NUMA_NODE" bash -c " + set -e + python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" + + # Run kernel tests + docker exec cpu-test-"$NUMA_NODE" bash -c " + set -e + pytest -x -v -s tests/kernels/test_onednn.py + pytest -x -v -s tests/kernels/attention/test_cpu_attn.py" + + # basic online serving + docker exec cpu-test-"$NUMA_NODE" bash -c ' + set -e + VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve meta-llama/Llama-3.2-3B-Instruct --max-model-len 2048 & + server_pid=$! + timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1 + vllm bench serve \ + --backend vllm \ + --dataset-name random \ + --model meta-llama/Llama-3.2-3B-Instruct \ + --num-prompts 20 \ + --endpoint /v1/completions + kill -s SIGTERM $server_pid &' +} + +# All of CPU tests are expected to be finished less than 40 mins. +export -f cpu_tests +timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index 4c961defaeda..eb3807ef0ca4 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -37,6 +37,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \ && curl -LsSf https://astral.sh/uv/install.sh | sh +ENV CC=/usr/bin/gcc-12 CXX=/usr/bin/g++-12 ENV CCACHE_DIR=/root/.cache/ccache ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache @@ -122,6 +123,15 @@ WORKDIR /workspace/vllm RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \ cp requirements/test.in requirements/cpu-test.in && \ sed -i '/mamba_ssm/d' requirements/cpu-test.in && \ + remove_packages_not_supported_on_aarch64() { \ + case "$(uname -m)" in \ + aarch64|arm64) \ + sed -i '/decord/d' requirements/cpu-test.in; \ + sed -i '/terratorch/d' requirements/cpu-test.in; \ + ;; \ + esac; \ + }; \ + remove_packages_not_supported_on_aarch64 && \ sed -i 's/^torch==.*/torch==2.8.0/g' requirements/cpu-test.in && \ sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \ sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \ From fcbcba6c70a3308705aa21adebb443bf9015b486 Mon Sep 17 00:00:00 2001 From: Benjamin Chislett Date: Wed, 19 Nov 2025 22:17:48 -0500 Subject: [PATCH 221/578] [Feat] Iteration-level profiling for Torch and CUDA profiler (#28987) Signed-off-by: Benjamin Chislett Signed-off-by: Benjamin Chislett Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- tests/v1/worker/test_gpu_profiler.py | 203 +++++++++++++++++++++++++ vllm/envs.py | 16 ++ vllm/profiler/gpu_profiler.py | 213 ++++++++++++++++++++++++--- vllm/v1/engine/async_llm.py | 14 +- vllm/v1/worker/gpu_worker.py | 50 ++----- 5 files changed, 435 insertions(+), 61 deletions(-) create mode 100644 tests/v1/worker/test_gpu_profiler.py diff --git a/tests/v1/worker/test_gpu_profiler.py b/tests/v1/worker/test_gpu_profiler.py new file mode 100644 index 000000000000..f7255fae05a4 --- /dev/null 
+++ b/tests/v1/worker/test_gpu_profiler.py @@ -0,0 +1,203 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest + +import vllm.envs as envs +from vllm.profiler.gpu_profiler import WorkerProfiler + + +class ConcreteWorkerProfiler(WorkerProfiler): + """ + A basic implementation of a worker profiler for testing purposes. + """ + + def __init__(self): + self.start_call_count = 0 + self.stop_call_count = 0 + self.should_fail_start = False + super().__init__() + + def _start(self) -> None: + if self.should_fail_start: + raise RuntimeError("Simulated start failure") + self.start_call_count += 1 + + def _stop(self) -> None: + self.stop_call_count += 1 + + +@pytest.fixture(autouse=True) +def reset_mocks(): + """Fixture to reset mocks and env variables before each test.""" + envs.VLLM_PROFILER_DELAY_ITERS = 0 + envs.VLLM_PROFILER_MAX_ITERS = 0 + + +def test_immediate_start_stop(): + """Test standard start without delay.""" + profiler = ConcreteWorkerProfiler() + + profiler.start() + assert profiler._running is True + assert profiler._active is True + assert profiler.start_call_count == 1 + + profiler.stop() + assert profiler._running is False + assert profiler._active is False + assert profiler.stop_call_count == 1 + + +def test_delayed_start(): + """Test that profiler waits for N steps before actually starting.""" + envs.VLLM_PROFILER_DELAY_ITERS = 2 + profiler = ConcreteWorkerProfiler() + + # User requests start + profiler.start() + + # Should be active (request accepted) but not running (waiting for delay) + assert profiler._active is True + assert profiler._running is False + assert profiler.start_call_count == 0 + + # Step 1 + profiler.step() + assert profiler._running is False + + # Step 2 (Threshold reached) + profiler.step() + assert profiler._running is True + assert profiler.start_call_count == 1 + + +def test_max_iterations(): + """Test that profiler stops automatically after max iterations.""" + envs.VLLM_PROFILER_MAX_ITERS = 2 + profiler = ConcreteWorkerProfiler() + + profiler.start() + assert profiler._running is True + + # Iteration 1 + profiler.step() # profiling_count becomes 1 + assert profiler._running is True + + # Iteration 2 + profiler.step() # profiling_count becomes 2 + assert profiler._running is True + + # Iteration 3 (Exceeds max) + profiler.step() # profiling_count becomes 3 + + # Should have stopped now + assert profiler._running is False + assert profiler.stop_call_count == 1 + + +def test_delayed_start_and_max_iters(): + """Test combined delayed start and max iterations.""" + envs.VLLM_PROFILER_DELAY_ITERS = 2 + envs.VLLM_PROFILER_MAX_ITERS = 2 + profiler = ConcreteWorkerProfiler() + + profiler.start() + + # Step 1 + profiler.step() + assert profiler._running is False + assert profiler._active is True + + # Step 2 (Starts now) + profiler.step() + assert profiler._profiling_for_iters == 1 + assert profiler._running is True + assert profiler._active is True + + # Next iteration + profiler.step() + assert profiler._profiling_for_iters == 2 + assert profiler._running is True + + # Iteration 2 (exceeds max) + profiler.step() + + # Should have stopped now + assert profiler._running is False + assert profiler.stop_call_count == 1 + + +def test_idempotency(): + """Test that calling start/stop multiple times doesn't break logic.""" + profiler = ConcreteWorkerProfiler() + + # Double Start + profiler.start() + profiler.start() + assert profiler.start_call_count == 1 # Should only start once + + # Double Stop 
+ profiler.stop() + profiler.stop() + assert profiler.stop_call_count == 1 # Should only stop once + + +def test_step_inactive(): + """Test that stepping while inactive does nothing.""" + envs.VLLM_PROFILER_DELAY_ITERS = 2 + profiler = ConcreteWorkerProfiler() + + # Not started yet + profiler.step() + profiler.step() + + # Even though we stepped 2 times, start shouldn't happen because active=False + assert profiler.start_call_count == 0 + + +def test_start_failure(): + """Test behavior when the underlying _start method raises exception.""" + profiler = ConcreteWorkerProfiler() + profiler.should_fail_start = True + + profiler.start() + + # Exception caught in _call_start + assert profiler._running is False # Should not mark as running + assert profiler._active is True # Request is still considered active + assert profiler.start_call_count == 0 # Logic failed inside start + + +def test_shutdown(): + """Test that shutdown calls stop only if running.""" + profiler = ConcreteWorkerProfiler() + + # Case 1: Not running + profiler.shutdown() + assert profiler.stop_call_count == 0 + + # Case 2: Running + profiler.start() + profiler.shutdown() + assert profiler.stop_call_count == 1 + + +def test_mixed_delay_and_stop(): + """Test manual stop during the delay period.""" + envs.VLLM_PROFILER_DELAY_ITERS = 5 + profiler = ConcreteWorkerProfiler() + + profiler.start() + profiler.step() + profiler.step() + + # User cancels before delay finishes + profiler.stop() + assert profiler._active is False + + # Further steps should not trigger start + profiler.step() + profiler.step() + profiler.step() + + assert profiler.start_call_count == 0 diff --git a/vllm/envs.py b/vllm/envs.py index 614bc94b978b..888a09cf6d3e 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -92,11 +92,14 @@ VLLM_TORCH_PROFILER_DIR: str | None = None VLLM_TORCH_PROFILER_RECORD_SHAPES: bool = False VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: bool = False + VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM: bool = False VLLM_USE_AOT_COMPILE: bool = False VLLM_USE_BYTECODE_HOOK: bool = False VLLM_FORCE_AOT_LOAD: bool = False VLLM_TORCH_PROFILER_WITH_STACK: bool = True VLLM_TORCH_PROFILER_WITH_FLOPS: bool = False + VLLM_PROFILER_DELAY_ITERS: int = 0 + VLLM_PROFILER_MAX_ITERS: int = 0 VLLM_USE_TRITON_AWQ: bool = False VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False VLLM_SKIP_P2P_CHECK: bool = False @@ -872,6 +875,19 @@ def get_vllm_port() -> int | None: "VLLM_TORCH_PROFILER_WITH_FLOPS": lambda: bool( os.getenv("VLLM_TORCH_PROFILER_WITH_FLOPS", "0") != "0" ), + # Disable torch profiling of the AsyncLLMEngine process. + # If set to 1, will not profile the engine process. + "VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM": lambda: bool( + os.getenv("VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM", "0") != "0" + ), + # Delay number of iterations before starting profiling when using + # the torch/torch CUDA profiler. If set to 0, will start profiling immediately. + "VLLM_PROFILER_DELAY_ITERS": lambda: int( + os.getenv("VLLM_PROFILER_DELAY_ITERS", "0") + ), + # Maximum number of iterations to profile when using the torch/torch CUDA profiler. + # If set to 0, will not limit the number of iterations. + "VLLM_PROFILER_MAX_ITERS": lambda: int(os.getenv("VLLM_PROFILER_MAX_ITERS", "0")), # If set, vLLM will use Triton implementations of AWQ. 
"VLLM_USE_TRITON_AWQ": lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))), # If set, allow loading or unloading lora adapters in runtime, diff --git a/vllm/profiler/gpu_profiler.py b/vllm/profiler/gpu_profiler.py index 58c668953161..2155b67a3db4 100644 --- a/vllm/profiler/gpu_profiler.py +++ b/vllm/profiler/gpu_profiler.py @@ -1,37 +1,212 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from abc import ABC, abstractmethod +from contextlib import nullcontext + +import torch +from typing_extensions import override + +import vllm.envs as envs from vllm.logger import init_logger logger = init_logger(__name__) -class CudaProfilerWrapper: +class WorkerProfiler(ABC): def __init__(self) -> None: - self._profiler_running = False - # Note: lazy import to avoid dependency issues if CUDA is not available. - import torch.cuda.profiler as cuda_profiler + self._delay_iters = envs.VLLM_PROFILER_DELAY_ITERS + if self._delay_iters > 0: + logger.info_once( + "GPU profiling will start " + f"{self._delay_iters} steps after start_profile." + ) - self._cuda_profiler = cuda_profiler + self._max_iters = envs.VLLM_PROFILER_MAX_ITERS + if self._max_iters > 0: + logger.info_once( + "GPU profiling will stop " + f"after {self._max_iters} worker steps, " + "or when stop_profile is received." + ) - def start(self) -> None: + # Track when the profiler gets triggered by start_profile + self._active_iteration_count = 0 + self._active = False + + # Track when the profiler is actually running + self._profiling_for_iters = 0 + self._running = False + + @abstractmethod + def _start(self) -> None: + """Start the profiler.""" + pass + + @abstractmethod + def _stop(self) -> None: + """Stop the profiler.""" + pass + + def _call_start(self) -> None: + """Call _start with error handling but no safeguards.""" try: - self._cuda_profiler.start() - self._profiler_running = True - logger.info_once("Started CUDA profiler") + self._start() + self._running = True # Only mark as running if start succeeds except Exception as e: - logger.warning_once("Failed to start CUDA profiler: %s", e) + logger.warning("Failed to start profiler: %s", e) + + def _call_stop(self) -> None: + """Call _stop with error handling but no safeguards.""" + try: + self._stop() + logger.info("Profiler stopped successfully.") + except Exception as e: + logger.warning("Failed to stop profiler: %s", e) + self._running = False # Always mark as not running, assume stop worked + + def start(self) -> None: + """Attempt to start the profiler, accounting for delayed starts.""" + if self._active: + logger.debug( + "start_profile received when profiler is already active. " + "Ignoring request." + ) + return + self._active = True + if self._delay_iters == 0: + self._call_start() + + def step(self) -> None: + """Update the profiler state at each worker step, + to handle delayed starts and max iteration limits.""" + if not self._active: + return + + self._active_iteration_count += 1 + + if ( + not self._running + and self._delay_iters > 0 + and self._active_iteration_count == self._delay_iters + ): + logger.info("Starting profiler after delay...") + self._call_start() + + if self._running: + self._profiling_for_iters += 1 + + if ( + self._max_iters > 0 + and self._running + and self._profiling_for_iters > self._max_iters + ): + # Automatically stop the profiler after max iters + # will be marked as not running, but leave as active so that stop + # can clean up properly + logger.info("Max profiling iterations reached. 
Stopping profiler...") + self._call_stop() + return def stop(self) -> None: - if self._profiler_running: - try: - self._cuda_profiler.stop() - logger.info_once("Stopped CUDA profiler") - except Exception as e: - logger.warning_once("Failed to stop CUDA profiler: %s", e) - finally: - self._profiler_running = False + """Attempt to stop the profiler, accounting for overlapped calls.""" + if not self._active: + logger.debug( + "stop_profile received when profiler is not active. Ignoring request." + ) + return + self._active = False + self._active_iteration_count = 0 + self._profiling_for_iters = 0 + + if self._running: + self._call_stop() def shutdown(self) -> None: """Ensure profiler is stopped when shutting down.""" - self.stop() + logger.info_once("Shutting down profiler") + if self._running: + self.stop() + + def annotate_context_manager(self, name: str): + """Return a context manager to annotate profiler traces.""" + return nullcontext() + + +class TorchProfilerWrapper(WorkerProfiler): + def __init__(self, worker_name: str, local_rank: int) -> None: + super().__init__() + + self.local_rank = local_rank + torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR + logger.info( + "Torch profiling enabled. Traces will be saved to: %s", + torch_profiler_trace_dir, + ) + logger.debug( + "Profiler config: record_shapes=%s," + "profile_memory=%s,with_stack=%s,with_flops=%s", + envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, + envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, + envs.VLLM_TORCH_PROFILER_WITH_STACK, + envs.VLLM_TORCH_PROFILER_WITH_FLOPS, + ) + self.profiler = torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA, + ], + record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, + profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, + with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK, + with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS, + on_trace_ready=torch.profiler.tensorboard_trace_handler( + torch_profiler_trace_dir, worker_name=worker_name, use_gzip=True + ), + ) + + @override + def _start(self) -> None: + self.profiler.start() + + @override + def _stop(self) -> None: + self.profiler.stop() + + rank = self.local_rank + profiler_dir = envs.VLLM_TORCH_PROFILER_DIR + profiler_out_file = f"{profiler_dir}/profiler_out_{rank}.txt" + sort_key = "self_cuda_time_total" + table = self.profiler.key_averages().table(sort_by=sort_key) + + with open(profiler_out_file, "w") as f: + print(table, file=f) + + # only print profiler results on rank 0 + if rank == 0: + print(table) + + @override + def annotate_context_manager(self, name: str): + return torch.profiler.record_function(name) + + +class CudaProfilerWrapper(WorkerProfiler): + def __init__(self) -> None: + super().__init__() + # Note: lazy import to avoid dependency issues if CUDA is not available. 
+ import torch.cuda.profiler as cuda_profiler + + self._cuda_profiler = cuda_profiler + + @override + def _start(self) -> None: + self._cuda_profiler.start() + + @override + def _stop(self) -> None: + self._cuda_profiler.stop() + + @override + def annotate_context_manager(self, name: str): + return torch.cuda.nvtx.range(name) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index c160c7cbcab4..abf2c8cfa453 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -160,11 +160,23 @@ def __init__( except RuntimeError: pass - if envs.VLLM_TORCH_PROFILER_DIR: + if ( + envs.VLLM_TORCH_PROFILER_DIR + and not envs.VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM + ): logger.info( "Torch profiler enabled. AsyncLLM CPU traces will be collected under %s", # noqa: E501 envs.VLLM_TORCH_PROFILER_DIR, ) + if envs.VLLM_PROFILER_MAX_ITERS > 0 or envs.VLLM_PROFILER_DELAY_ITERS > 0: + logger.warning_once( + "Torch profiler received max_iters or delay_iters setting. These " + "are not compatible with the AsyncLLM profiler and will be ignored " + "for the AsyncLLM process. Engine process profiling will still " + "respect these settings. Consider setting " + "VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM=1 to disable " + "AsyncLLM profiling." + ) worker_name = f"{socket.gethostname()}_{os.getpid()}.async_llm" self.profiler = torch.profiler.profile( activities=[ diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 7f9cdd221224..18cbc3826279 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -36,7 +36,7 @@ from vllm.model_executor.models.interfaces import is_mixture_of_experts from vllm.model_executor.warmup.kernel_warmup import kernel_warmup from vllm.platforms import current_platform -from vllm.profiler.gpu_profiler import CudaProfilerWrapper +from vllm.profiler.gpu_profiler import CudaProfilerWrapper, TorchProfilerWrapper from vllm.sequence import IntermediateTensors from vllm.tasks import SupportedTask from vllm.utils.mem_constants import GiB_bytes @@ -90,32 +90,9 @@ def __init__( # Torch profiler. Enabled and configured through env vars: # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace if envs.VLLM_TORCH_PROFILER_DIR: - torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR worker_name = f"{vllm_config.instance_id}-rank-{self.rank}" - logger.info( - "Profiling enabled. 
Traces will be saved to: %s", - torch_profiler_trace_dir, - ) - logger.debug( - "Profiler config: record_shapes=%s," - "profile_memory=%s,with_stack=%s,with_flops=%s", - envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, - envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, - envs.VLLM_TORCH_PROFILER_WITH_STACK, - envs.VLLM_TORCH_PROFILER_WITH_FLOPS, - ) - self.profiler = torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, - profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, - with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK, - with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS, - on_trace_ready=torch.profiler.tensorboard_trace_handler( - torch_profiler_trace_dir, worker_name=worker_name, use_gzip=True - ), + self.profiler = TorchProfilerWrapper( + worker_name=worker_name, local_rank=self.local_rank ) elif envs.VLLM_TORCH_CUDA_PROFILE: self.profiler = CudaProfilerWrapper() @@ -526,10 +503,12 @@ def annotate_profile(self, scheduler_output): if not self.profiler: return nullcontext() + self.profiler.step() + num_new = len(scheduler_output.scheduled_new_reqs) num_cached = len(scheduler_output.scheduled_cached_reqs.req_ids) - return torch.profiler.record_function( + return self.profiler.annotate_context_manager( f"execute_new_{num_new}_cached_{num_cached}" ) @@ -587,24 +566,11 @@ def take_draft_token_ids(self) -> DraftTokenIds | None: def profile(self, is_start: bool = True): if self.profiler is None: - raise RuntimeError("Profiler is not enabled.") + raise RuntimeError("Profiling is not enabled.") if is_start: self.profiler.start() else: self.profiler.stop() - if isinstance(self.profiler, torch.profiler.profile): - rank = self.local_rank - profiler_dir = envs.VLLM_TORCH_PROFILER_DIR - profiler_out_file = f"{profiler_dir}/profiler_out_{rank}.txt" - sort_key = "self_cuda_time_total" - table = self.profiler.key_averages().table(sort_by=sort_key) - - with open(profiler_out_file, "w") as f: - print(table, file=f) - - # only print profiler results on rank 0 - if rank == 0: - print(table) def execute_dummy_batch(self) -> None: self.model_runner._dummy_run(1, uniform_decode=True) @@ -865,6 +831,8 @@ def save_tensorized_model( def shutdown(self) -> None: if runner := getattr(self, "model_runner", None): runner.ensure_kv_transfer_shutdown() + if self.profiler is not None: + self.profiler.shutdown() def init_worker_distributed_environment( From a8c536829cb7b5564f54beff97e938666f286dd6 Mon Sep 17 00:00:00 2001 From: Shengliang Xu <106840466+shengliangxu@users.noreply.github.com> Date: Wed, 19 Nov 2025 19:39:36 -0800 Subject: [PATCH 222/578] Consolidate Nvidia ModelOpt quant config handling for all quantization methods (#28076) Signed-off-by: Shengliang Xu --- .../layers/quantization/modelopt.py | 499 ++++++++---------- 1 file changed, 234 insertions(+), 265 deletions(-) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index dedab33c1bdb..6b5ed7762eb3 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Callable +from fnmatch import fnmatch from typing import TYPE_CHECKING, Any, Optional import torch @@ -13,7 +14,6 @@ from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant from vllm.logger import init_logger from 
vllm.model_executor.layers.fused_moe.config import ( - FusedMoEConfig, FusedMoEQuantConfig, RoutingMethodType, fp8_w8a8_moe_quant_config, @@ -86,45 +86,218 @@ KV_CACHE_QUANT_ALGOS = ["FP8"] -class ModelOptFp8Config(QuantizationConfig): +class ModelOptFp8KVCacheMethod(BaseKVCacheMethod): + """ + Supports loading kv-cache scaling factors from FP8 checkpoints. + """ + + def __init__(self, quant_config: "ModelOptQuantConfigBase"): + super().__init__(quant_config) + + +class ModelOptQuantConfigBase(QuantizationConfig): + LinearMethodCls: type = LinearMethodBase + FusedMoEMethodCls: type = FusedMoEMethodBase + KVCacheMethodCls: type = BaseKVCacheMethod + + def __init__( + self, + exclude_modules: list[str], + ): + super().__init__() + self.exclude_modules: list[str] = exclude_modules + + def is_layer_excluded(self, prefix: str) -> bool: + """ + Check if a layer should be excluded from quantization. + + Handles both exact matching (for fused layers) and ModelOpt wildcard matching. + + The ModelOpt exclude_modules list is a list of wildcards. + """ + if len(self.exclude_modules) == 0: + return False + + # First check exact matching with fused layer support + if is_layer_skipped(prefix, self.exclude_modules, self.packed_modules_mapping): + return True + + # TODO: This special hard coded logic is not needed for quantized checkpoints + # generated by ModelOpt >= 0.39.0 where they are handled natually by the + # exclude_modules config. But need to keep them for loading quantized + # checkpoints generated by older versions. Then check substring matching + # for patterns not caught by exact match + for exclude_module in self.exclude_modules: + # Skip exact matches already handled above + if exclude_module != prefix and ( + exclude_module in prefix + or ( + prefix.startswith("language_model.") + and exclude_module in prefix.removeprefix("language_model.") + ) + ): + return True + + # modelopt exclude modules are not simple strings, they are wildcards + for wildcard_pattern in self.exclude_modules: + if fnmatch(prefix, wildcard_pattern): + return True + + return False + + def get_quant_method( + self, layer: torch.nn.Module, prefix: str + ) -> Optional["QuantizeMethodBase"]: + from vllm.attention.layer import Attention # Avoid circular import + + # handle kv-cache first so we can focus only on weight quantization thereafter + if isinstance(layer, Attention): + return self.KVCacheMethodCls(self) + + # handle exclusion + if self.is_layer_excluded(prefix): + if isinstance(layer, LinearBase): + return UnquantizedLinearMethod() + return None + + # TODO: This special hard coded logic is not needed for quantized checkpoints + # generated by ModelOpt >= 0.39.0 where they are handled natually by the + # exclude_modules config. But need to keep them for loading quantized + # checkpoints generated by older versions. 
Then check substring matching + # for patterns not caught by exact match + if "vision_tower" in prefix or "vision_model" in prefix: + return UnquantizedLinearMethod() + + # now, the layer is quantized, handle it here + if isinstance(layer, LinearBase): + return self.LinearMethodCls(self) + elif isinstance(layer, FusedMoE): + return self.FusedMoEMethodCls(quant_config=self, layer=layer) + + return None + + def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): + if len(self.exclude_modules) > 0: + self.exclude_modules = hf_to_vllm_mapper.apply_list(self.exclude_modules) + + @staticmethod + def get_config_filenames() -> list[str]: + return ["hf_quant_config.json"] + + @classmethod + def _from_config( + cls, + *, + quant_method: str, + kv_cache_quant_method: str | None, + exclude_modules: list[str], + original_config: dict[str, Any], + group_size: int | None, + ) -> "ModelOptQuantConfigBase": + raise NotImplementedError("Please implement this function in sub classes") + + @classmethod + def from_config(cls, config: dict[str, Any]) -> "ModelOptQuantConfigBase": + # Handle both ModelOpt format and compressed-tensors style format + if "quantization" in config: + # Traditional ModelOpt format: + # {"quantization": {"quant_algo": "..."}} + quant_config = cls.get_from_keys(config, ["quantization"]) + if not isinstance(quant_config, dict): + raise ValueError("Expected 'quantization' to be a dictionary in config") + + quant_method = quant_config.get("quant_algo") + + # Handle kv_cache_quant_algo with proper type validation + kv_cache_quant_method = quant_config.get("kv_cache_quant_algo") + + # Handle group_size with proper type validation + group_size_raw = quant_config.get("group_size") + + # "exclude_modules" is the key in the legacy hf_quant_config.json + exclude_modules = quant_config.get("exclude_modules", []) + else: + # Compressed-tensors style format: + # {"quant_algo": "...", "quant_method": "modelopt"} + quant_method = config.get("quant_algo") + kv_cache_quant_method = config.get("kv_cache_quant_algo") + # "ignore" is the key in config.json + exclude_modules = config.get("ignore", []) + group_size_raw = config.get("group_size") + + if not quant_method: + raise ValueError("Missing 'quant_algo' in quantization config") + + if kv_cache_quant_method is None: + # No KV cache quantization, keep this branch just to have this comment + pass + elif not isinstance(kv_cache_quant_method, str): + raise ValueError( + f"kv_cache_quant_algo must be a string, got " + f"{type(kv_cache_quant_method)}" + ) + + if not isinstance(exclude_modules, list): + raise ValueError( + f"exclude_modules must be a list, got {type(exclude_modules)}" + ) + + if group_size_raw is None: + group_size = None + elif isinstance(group_size_raw, int): + group_size = group_size_raw + else: + try: + group_size = int(group_size_raw) + except (ValueError, TypeError): + raise ValueError( + f"group_size must be an integer, got {type(group_size_raw)}" + ) from None + + if quant_method not in QUANT_ALGOS: + raise ValueError( + f"ModelOpt currently only supports: {QUANT_ALGOS} " + "quantizations in vLLM. Please check the " + "`hf_quant_config.json` file for your model's " + "quant configuration." 
+ ) + return cls._from_config( + quant_method=quant_method, + kv_cache_quant_method=kv_cache_quant_method, + exclude_modules=exclude_modules, + group_size=group_size, + original_config=config, + ) + + +class ModelOptFp8Config(ModelOptQuantConfigBase): """Config class for ModelOpt FP8.""" def __init__( self, - is_checkpoint_fp8_serialized: bool = False, - kv_cache_quant_method: str | None = None, - exclude_modules: list[str] | None = None, + is_checkpoint_fp8_serialized: bool, + kv_cache_quant_method: str | None, + exclude_modules: list[str], ) -> None: - super().__init__() + super().__init__(exclude_modules) self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized self.kv_cache_quant_method = kv_cache_quant_method - self.exclude_modules = exclude_modules or [] if is_checkpoint_fp8_serialized: logger.warning( "Detected ModelOpt fp8 checkpoint. Please note that" " the format is experimental and could change." ) - @classmethod - def get_name(cls) -> QuantizationMethods: + def get_name(self) -> QuantizationMethods: return "modelopt" - @classmethod - def get_supported_act_dtypes(cls) -> list[torch.dtype]: + def get_supported_act_dtypes(self) -> list[torch.dtype]: return [torch.bfloat16, torch.half] @classmethod def get_min_capability(cls) -> int: return 89 - @classmethod - def get_config_filenames(cls) -> list[str]: - return ["hf_quant_config.json"] - - def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): - if self.exclude_modules is not None: - self.exclude_modules = hf_to_vllm_mapper.apply_list(self.exclude_modules) - @classmethod def override_quantization_method( cls, hf_quant_cfg, user_quant @@ -158,88 +331,19 @@ def override_quantization_method( return None @classmethod - def from_config(cls, config: dict[str, Any]) -> "ModelOptFp8Config": - # Handle both ModelOpt format and compressed-tensors style format - if "quantization" in config: - # ModelOpt format: {"quantization": {"quant_algo": "..."}} - quant_config = cls.get_from_keys(config, ["quantization"]) - if not isinstance(quant_config, dict): - raise ValueError("Expected 'quantization' to be a dictionary in config") - quant_method = quant_config.get("quant_algo", "") - if not quant_method: - raise ValueError("Missing 'quant_algo' in quantization config") - kv_cache_quant_method = quant_config.get("kv_cache_quant_algo") - # "exclude_modules" is the key in the legacy hf_quant_config.json - exclude_modules = quant_config.get("exclude_modules") - else: - # Compressed-tensors style format: - # {"quant_algo": "...", "quant_method": "modelopt"} - quant_method = config.get("quant_algo", "") - kv_cache_quant_method = config.get("kv_cache_quant_algo") - # "ignore" is the key in config.json - exclude_modules = config.get("ignore") - - if quant_method not in QUANT_ALGOS: - raise ValueError( - f"ModelOpt currently only supports: {QUANT_ALGOS} " - "quantizations in vLLM. Please check the " - "`hf_quant_config.json` file for your model's " - "quant configuration." - ) + def _from_config( + cls, + *, + quant_method: str, + kv_cache_quant_method: str | None, + exclude_modules: list[str], + original_config: dict[str, Any], + **kwargs: Any, + ) -> "ModelOptFp8Config": is_checkpoint_fp8_serialized = "FP8" in quant_method return cls(is_checkpoint_fp8_serialized, kv_cache_quant_method, exclude_modules) - def is_layer_excluded(self, prefix: str) -> bool: - """ - Check if a layer should be excluded from quantization. - Handles both exact matching (for fused layers) and substring matching. 
- - This method handles both regular models and multimodal models that use - the language_model prefix. For multimodal models, it checks if the - module name (without the language_model prefix) is in the exclude list. - """ - if self.exclude_modules is None: - return False - - # First check exact matching with fused layer support - if is_layer_skipped(prefix, self.exclude_modules, self.packed_modules_mapping): - return True - - # Then check substring matching for patterns not caught by exact match - for module in self.exclude_modules: - # Skip exact matches already handled above - if module != prefix and ( - module in prefix - or ( - prefix.startswith("language_model.") - and module in prefix.removeprefix("language_model.") - ) - ): - return True - return False - - def get_quant_method( - self, layer: torch.nn.Module, prefix: str - ) -> Optional["QuantizeMethodBase"]: - from vllm.attention.layer import ( # Avoid circular import - Attention, - MLAAttention, - ) - - if isinstance(layer, LinearBase): - if self.is_layer_excluded(prefix): - return UnquantizedLinearMethod() - # Check if this is a vision model layer that should not be quantized - if "vision_tower" in prefix or "vision_model" in prefix: - return UnquantizedLinearMethod() - return ModelOptFp8LinearMethod(self) - elif isinstance(layer, (Attention, MLAAttention)): - return ModelOptFp8KVCacheMethod(self) - elif isinstance(layer, FusedMoE): - return ModelOptFp8MoEMethod(self, layer) - return None - class ModelOptFp8LinearMethod(LinearMethodBase): """Linear method for Model Optimizer static quantization. @@ -344,7 +448,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): def __init__( self, quant_config: ModelOptFp8Config, - layer: torch.nn.Module, + layer: FusedMoE, ) -> None: super().__init__(layer.moe_config) self.layer = layer @@ -686,7 +790,12 @@ def apply( ) -class ModelOptNvFp4Config(QuantizationConfig): +ModelOptFp8Config.LinearMethodCls = ModelOptFp8LinearMethod +ModelOptFp8Config.FusedMoEMethodCls = ModelOptFp8MoEMethod +ModelOptFp8Config.KVCacheMethodCls = ModelOptFp8KVCacheMethod + + +class ModelOptNvFp4Config(ModelOptQuantConfigBase): """Config class for ModelOpt FP4.""" def __init__( @@ -696,7 +805,7 @@ def __init__( exclude_modules: list[str], group_size: int = 16, ) -> None: - super().__init__() + super().__init__(exclude_modules) self.is_checkpoint_nvfp4_serialized = is_checkpoint_nvfp4_serialized if is_checkpoint_nvfp4_serialized: logger.warning( @@ -706,28 +815,17 @@ def __init__( self.group_size = group_size self.kv_cache_quant_algo = kv_cache_quant_algo - self.exclude_modules = exclude_modules - @classmethod - def get_name(cls) -> QuantizationMethods: + def get_name(self) -> QuantizationMethods: return "modelopt_fp4" - @classmethod - def get_supported_act_dtypes(cls) -> list[torch.dtype]: + def get_supported_act_dtypes(self) -> list[torch.dtype]: return [torch.bfloat16, torch.half, torch.float8_e4m3fn] @classmethod def get_min_capability(cls) -> int: return 80 - @classmethod - def get_config_filenames(cls) -> list[str]: - return ["hf_quant_config.json"] - - def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): - if self.exclude_modules is not None: - self.exclude_modules = hf_to_vllm_mapper.apply_list(self.exclude_modules) - @classmethod def override_quantization_method( cls, hf_quant_cfg, user_quant @@ -761,105 +859,25 @@ def override_quantization_method( return None @classmethod - def from_config(cls, config: dict[str, Any]) -> "ModelOptNvFp4Config": - # Handle both traditional ModelOpt format and 
compressed-tensors - # style format - if "quantization" in config: - # Traditional ModelOpt format: - # {"quantization": {"quant_algo": "..."}} - quant_config = cls.get_from_keys(config, ["quantization"]) - if not isinstance(quant_config, dict): - raise ValueError("Expected 'quantization' to be a dictionary in config") - - quant_method = quant_config.get("quant_algo", "") - if not quant_method: - raise ValueError("Missing 'quant_algo' in quantization config") - - # Handle kv_cache_quant_algo with proper type validation - kv_cache_quant_algo_raw = quant_config.get("kv_cache_quant_algo") - if kv_cache_quant_algo_raw is None: - # No KV cache quantization by default - kv_cache_quant_algo = None - elif isinstance(kv_cache_quant_algo_raw, str): - kv_cache_quant_algo = kv_cache_quant_algo_raw - else: - raise ValueError( - f"kv_cache_quant_algo must be a string, got " - f"{type(kv_cache_quant_algo_raw)}" - ) - - # Handle group_size with proper type validation - group_size_raw = quant_config.get("group_size") - if group_size_raw is None: - group_size = 16 # Default value - elif isinstance(group_size_raw, int): - group_size = group_size_raw - else: - try: - group_size = int(group_size_raw) - except (ValueError, TypeError): - raise ValueError( - f"group_size must be an integer, got {type(group_size_raw)}" - ) from None - - # "exclude_modules" is the key in the legacy hf_quant_config.json - exclude_modules = quant_config.get("exclude_modules", []) - if not isinstance(exclude_modules, list): - raise ValueError( - f"exclude_modules must be a list, got {type(exclude_modules)}" - ) - else: - # Compressed-tensors style format: - # {"quant_algo": "...", "quant_method": "modelopt"} - quant_method = config.get("quant_algo", "") - - # Handle kv_cache_quant_algo with proper type validation - kv_cache_quant_algo_raw = config.get("kv_cache_quant_algo") - if kv_cache_quant_algo_raw is None: - # No KV cache quantization by default - kv_cache_quant_algo = None - elif isinstance(kv_cache_quant_algo_raw, str): - kv_cache_quant_algo = kv_cache_quant_algo_raw - else: - raise ValueError( - f"kv_cache_quant_algo must be a string, got " - f"{type(kv_cache_quant_algo_raw)}" - ) - - # Handle group_size with proper type validation - group_size_raw = config.get("group_size") - if group_size_raw is None: - group_size = 16 # Default value - elif isinstance(group_size_raw, int): - group_size = group_size_raw - else: - try: - group_size = int(group_size_raw) - except (ValueError, TypeError): - raise ValueError( - f"group_size must be an integer, got {type(group_size_raw)}" - ) from None - - # "ignore" is the key in config.json - exclude_modules = config.get("ignore", []) - if not isinstance(exclude_modules, list): - raise ValueError( - f"exclude_modules must be a list, got {type(exclude_modules)}" - ) - - if quant_method not in QUANT_ALGOS: - raise ValueError( - f"ModelOpt currently only supports: {QUANT_ALGOS} " - "quantizations in vLLM. Please check the " - "`hf_quant_config.json` file for your model's " - "quant configuration." 
- ) + def _from_config( + cls, + *, + quant_method: str, + kv_cache_quant_method: str | None, + exclude_modules: list[str], + original_config: dict[str, Any], + group_size: int | None, + **kwargs: Any, + ) -> "ModelOptNvFp4Config": is_checkpoint_nvfp4_serialized = "NVFP4" in quant_method + if group_size is None: + group_size = 16 # Default value + # For FP4, these fields are required - if is_checkpoint_nvfp4_serialized and "quantization" in config: + if is_checkpoint_nvfp4_serialized and "quantization" in original_config: # Check if required fields are present in the quantization config - quant_config = config["quantization"] + quant_config = original_config["quantization"] required_fields = ["group_size", "kv_cache_quant_algo", "exclude_modules"] missing_fields = [ field for field in required_fields if field not in quant_config @@ -872,64 +890,11 @@ def from_config(cls, config: dict[str, Any]) -> "ModelOptNvFp4Config": return cls( is_checkpoint_nvfp4_serialized, - kv_cache_quant_algo, + kv_cache_quant_method, exclude_modules, group_size, ) - def is_layer_excluded(self, prefix: str) -> bool: - """ - Check if a layer should be excluded from quantization. - Handles both exact matching (for fused layers) and pattern matching. - """ - # First check exact matching with fused layer support - if is_layer_skipped(prefix, self.exclude_modules, self.packed_modules_mapping): - return True - - # Check regex pattern matching for patterns not caught by exact match - import regex as re - - for pattern in self.exclude_modules: - # Skip patterns that would be caught by exact matching - if "*" in pattern or "." in pattern: - regex_str = pattern.replace(".", r"\.").replace("*", r".*") - if re.fullmatch(regex_str, prefix): - return True - return False - - def get_quant_method( - self, layer: torch.nn.Module, prefix: str - ) -> Optional["QuantizeMethodBase"]: - from vllm.attention.layer import ( # Avoid circular import - Attention, - MLAAttention, - ) - - skip_layer = self.is_layer_excluded(prefix) - if isinstance(layer, LinearBase): - if skip_layer: - return UnquantizedLinearMethod() - # Check if this is a vision model layer that should not be quantized - if "vision_tower" in prefix or "vision_model" in prefix: - return UnquantizedLinearMethod() - return ModelOptNvFp4LinearMethod(self) - elif isinstance(layer, (Attention, MLAAttention)): - return ModelOptFp8KVCacheMethod(self) - elif isinstance(layer, FusedMoE): - if skip_layer: - return None - return ModelOptNvFp4FusedMoE(self, layer.moe_config, layer) - return None - - -class ModelOptFp8KVCacheMethod(BaseKVCacheMethod): - """ - Supports loading kv-cache scaling factors from FP8 checkpoints. - """ - - def __init__(self, quant_config: ModelOptFp8Config | ModelOptNvFp4Config): - super().__init__(quant_config) - class ModelOptNvFp4LinearMethod(LinearMethodBase): """Linear method for Model Optimizer NVFP4. 
@@ -1157,14 +1122,13 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): def __init__( self, quant_config: ModelOptNvFp4Config, - moe: FusedMoEConfig, - layer: torch.nn.Module, + layer: FusedMoE, ) -> None: from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import ( detect_nvfp4_moe_support, # noqa: E501 ) - super().__init__(moe) + super().__init__(layer.moe_config) self.quant_config = quant_config self.layer = layer _nvfp4 = detect_nvfp4_moe_support(self.__class__.__name__) @@ -1802,3 +1766,8 @@ def apply( k=x.shape[1], e=layer.w13_weight.shape[0], ) + + +ModelOptNvFp4Config.LinearMethodCls = ModelOptNvFp4LinearMethod +ModelOptNvFp4Config.FusedMoEMethodCls = ModelOptNvFp4FusedMoE +ModelOptNvFp4Config.KVCacheMethodCls = ModelOptFp8KVCacheMethod From 0cca9b4d130b4caddb60086ef26a0d8741582dcb Mon Sep 17 00:00:00 2001 From: prashanth058 Date: Wed, 19 Nov 2025 19:50:37 -0800 Subject: [PATCH 223/578] [Bugfix] Fix precision loss in LoRA-wrapped RowParallelLinear by fusing bias into GEMM (#28972) Signed-off-by: prashanth058 --- vllm/lora/layers/row_parallel_linear.py | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/vllm/lora/layers/row_parallel_linear.py b/vllm/lora/layers/row_parallel_linear.py index 2ef1bd98fc61..95517b1aee26 100644 --- a/vllm/lora/layers/row_parallel_linear.py +++ b/vllm/lora/layers/row_parallel_linear.py @@ -63,23 +63,18 @@ def forward( input_parallel = splitted_input[self.tp_rank].contiguous() # Matrix multiply. - output_parallel = self.apply(input_parallel) + bias_ = ( + None + if (self.tp_rank > 0 or self.base_layer.skip_bias_add) + else self.base_layer.bias + ) + output_parallel = self.apply(input_parallel, bias_) if self.base_layer.reduce_results and self.tp_size > 1: - output_ = tensor_model_parallel_all_reduce(output_parallel) - else: - output_ = output_parallel - - if not self.base_layer.skip_bias_add: - output = ( - output_ + self.base_layer.bias - if self.base_layer.bias is not None - else output_ - ) - output_bias = None + output = tensor_model_parallel_all_reduce(output_parallel) else: - output = output_ - output_bias = self.base_layer.bias + output = output_parallel + output_bias = self.base_layer.bias if self.base_layer.skip_bias_add else None if not self.base_layer.return_bias: return output @@ -120,7 +115,7 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: return lora_b def apply(self, x: torch.Tensor, bias: torch.Tensor | None = None) -> torch.Tensor: - output = self.base_layer.quant_method.apply(self.base_layer, x) + output = self.base_layer.quant_method.apply(self.base_layer, x, bias) x = x.view(-1, x.shape[-1]) output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape From fe25772aa97beb8bcb07ea49e06a2892b521a7ed Mon Sep 17 00:00:00 2001 From: Canlin Guo Date: Thu, 20 Nov 2025 12:38:12 +0800 Subject: [PATCH 224/578] [Bugfix] Handle broken frames in video loading (#29001) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: gcanlin Signed-off-by: 凌葭 Co-authored-by: 凌葭 --- tests/multimodal/assets/corrupted.mp4 | Bin 0 -> 91678 bytes tests/multimodal/test_video.py | 37 ++++++++ vllm/multimodal/video.py | 118 ++++++++++++++++---------- 3 files changed, 112 insertions(+), 43 deletions(-) create mode 100644 tests/multimodal/assets/corrupted.mp4 diff --git a/tests/multimodal/assets/corrupted.mp4 b/tests/multimodal/assets/corrupted.mp4 new file mode 100644 index 
0000000000000000000000000000000000000000..c355bb932ceeeae13cc2d0a4752dcdf8c5136720 GIT binary patch literal 91678 zcmYJZV{|A@6D=IuImyY1ZQHhO+qP}nwr$(CZQHqd-uvOl^z7cftGcSXSI=5A0{{Sk zYwYM|Yvy2M1poj9@ZbISqStdaptG`Nqyqo|fH1Z zTQ^H1V@F(iT3QAwdRls>U((dk(UzTt#>K^j+S$y|$i`C7n%c(RgyuiL)TWMBmcJSs zTSqe+YX^2*13i5`11>sTdn02m23$iUeM=hy3obf#T6S7oJ!?HnHwPmwT2}^kT30$c z23#v6E>j~{Tn8upU&MlI>)`fl`rGQ*8*)1u11DB|9PbQwa~HGvo+y4~**MXg_j*YRggOMW_6+Nz_sr|2o0~aH%rHze+p6PE#=l|y%EX@pl zWBGpxI^6#e|EaC*|2LU|nU$X7e*u|UI~v(r>iw#Jd3{SKdp$QD0~;$_J;&d^!Edb` z?e)y8e|`N5?e+fC7~AVv89Dq`O<%{>?H8LFa?$@{JwrX)|I*Rd(Kpj`_-_(3d!zp) z=wf7MV(O^>J7!~RWUXUjWBWV$e@WY4Qwt-vU*BBxOtk+G)Uh(N=3>HiFfg($!3H8HdOU98`Y@w=A4zVz&V_rZUw3g8O>fY@jh8VI2A zvu{t=>S!UFYi^@R=LI9)fqSS^NNgmVo^)3?V7)u;f_ak5?>@9YGvrl$R^ znP+^J6xq+sI(DsF*>p0}8m$B!x|G#0)yvyB*iXl?@Or`Yz`G`-HB+OkhDMV1;)PLC zSz1;b?dP57Djj$Se7RT0_wA7Mm`gZ$YLd-*qO?CtRtY5kPU4BD2v4#CqBPwT7f8mg zCpO_(cuNn|M$AX68D}i}01(x?OPbRE5-82_*UUFnTfBv{Z<_2J(Wd9-uHKzoeJVP> zwpoyH!%lTPtfZ_S*j1T!Wgsa}9RPcOzaE20sc0%+TUyvjtt2bliJ6@T6@A5cF>Q71 z6(1X&ZeTWgy^Y?9Wg6QAe*Of`Y+Eu`WIPe{(tN)?p8`D)SUAt0a0B-2-&K>%ji|YDb3Bf0n3DIi-dAh!6@CXCUM({{(09A1@udG z|KuI>FZJ(Eo@PX#3kmcv5UESADcZ2)68a`2k~ww)uYaDAv_ll%QAQ9CF3*D^p_gs5 z2Sy!a~{V~ zKI_|2SH8ZAXTk%1jmc{-1wUT15Fj2lB4}#V$GMexgIt%yKODUZm~BmXr+ z12#b+=M&P5_5_Td9Qn>=nhszeJ45>+GN{hO#$z_XTo?}%KR&*T#iNg+W=i6ok>QuB zTJo1d{iR!UH3GN8C-YQ|%~M3qE(Y7$cS(7)(5$-Lqp<2JyQho+&!yNMQf80m*eH(j zfykGPM2{WSDBKTh&GNg}@iuwKqg_5T+68&sn_^XA?l};k45G$fL|k17?Ri#iK)W*7 zAfqzQ9&oA`ta)IE)!;8xO%F_Y&I(UGgpkNSf_78S#TUG+^rbwx!}u-X^lL?EHh0zx zw-UUIM%W)w1avc)G0Pk{O>}6bnB`5N zn!C}F^OMpE_@zgR7p3}c+9lV z!9Kff*w{U_bdX9(5#Iw+xqs8KB&=JywC5J?vwRRX?g~m%1>7d1@Fanjw%s_FonMr` zrZK69#I{l@j3Yl2?RVVz8%gIyJWYa>O#}N}ib6L9{U~rB3D*IDO)I;ZC{a~Z^8Y{a}_qp5rE+O7;SLEAvQB!Fc_3+_r z3HeJlAog}TLRVql6#8+_N3-_$UVe5$$_lBi7vmM9tpur%g6W^{$}4G0Og|CA?+2N; zWc?OPqla=FXN+fb`KvDX*k;mb;P+PK-V%#?3^9&-X;F?2r(xOv*oUktl0@SFxp3|} zhS$eBNf{YTuLaYN#Kt4^20}FP;M5R*)&VW~s2bF~3KpoXv`Y8MwxdH6owenHB2J{#O14x_=7{|-%Fi!X!Ykwoh%&@Lo z+t`3QBq}YA?~&v#i24ilY>AKwDv_j&OKv#@kWrZ`Kl>OP%%t~I zs`C?efC3iaPKmVCf9F$T#wI$niT~pe6UO5<#}i-FR!fo1YKrx4?_ktu+y%A!QMAC` ze#>)W^3Qgc%c+HKGMVZ)_Uc8Bdy%D8W!)so2~j)4Fs*4&x|jB}lZbE{NuzR?OpJ4} z$hzzX6pMb&_QxrTh^`x(Sw?YNphH#5``W`{dyQbMJ|_+UpKC|0JWmcUxmO&{{Vr+&)?%7_W>one>7&-OEpwf<`F4b+3p&w7u zzagF-q*NkB~y~c0`Xdbc6`oq2P-TgbehL z1jSlN)Ct=+`u-$P`DLZEcW>cvMvKvF{kSre4LN$d@6zI zO7a*BflU9;4JR6cceR)y!%M%Y(TadpmM?ZS&1Okh%^$c@%H*sYc~(O|6YlW!=z!N- zXOfrLl?;4{!gJ38uXRpnEHU5O!a}5h9w!mAqZJB@EeM0az^*@PndcwM(roY{Z=9E$ zhfg}Q@M;$yG~@yq-FCiiwOzbqjr>oLc3nWZO96eXUpPw>MerpWe_@M;x%Y$afh5aiO5XBv*j3_L!BNiPg=V=1@)SKi%>I zcu%NE-s%H-K#`u1kO_|ijPp$s_oEbhZ4xAP795;Hk1C_hS@6(%uwzTK(zzsm(oXcD z_x5Ev!1{kejVRq(Q4c~%Ro7wpQ^Q=0LgQlQimfmbyg(H5GHzM0vpha%-HG!OMF`)r zbFNq>D1w4Gyf`Hm1KG$fOoA>70_VnP(FXr{XX$4>E4RSz?z9 zpeeF*LRs|^_HZ;RUhMJkdn}$KO>SKFq5?$1k-DxjK~l=x>ZRuyjd9%%1Q#9)_a(C~ z=DBjFv7_&UjkxR>CUboR^_Tv6aunJl0g@#@UHYEpbDmYH9CCi~%FL|xoIlt&P_+TA zd^$MBM8AeRN%!wgz4z}Tlb82}<10pgKdGyR-r9(t-^yOzZXD&=>fYzaR?r-KXste) zT4qp36>;In=yDKz!H7{(1~Sx8D~c%MMP(_dsA^nL*LeRlF^ z!+6_pEq;i{B>m%pZ%8+oO=$w&U8MQ#nUqEM6QR@E<$`@$P4oU5HM3mrAt5H1uCbz5 z_xqt;%+aF2tiYOp+uh=*CZPGS1*+^}M&<#);~H2+#PUuCe(3Ny+c5}`KY4~iqAu{z zOu|Vy#VgG{_O=67L_Lp)K*(*Cn%oGI81N?j%|nKo199#T1PdP8zFNqrB{y{(WyvM1 zvx7EdoZNvXE_b!vAx4-*Mj+>rbeB4+ti^`xzQtZT=WG!}^MxY%R6rr)(cBrawttK| zvl~wR%Hq{Kx)+N_a@M5Ja}S+r7tOpi;EjpBsMw!Zip@Rnr$UNeNzMU^&U~ejOEI@- zp(Re=#HmHNu;nGnxaF5bk~1x%&p&YpaOIXbiL|`!q#_#3YesksIcg@WVHqfoao^{i zVow|x3m%gU0qc|Re63+!RE1M#U$iWp!ftqHB_+DuyC}P*BG}T-YiBkwj(2241!A<{_=&nQUFDyd(Mve+jA 
zef6HrI>C>RzJ(m-$KCdAv78V_hmfoXC9!ok8?c8(X(y6kYd2^!qp=CEiB8}zbN?gs z=+D-N5#eP8iPQ%PnZTT2@fY<`=A)kk< z3*F&J$33}{SIn&%8%2kCytn_$jEF{Q-x@L)L6V8s&-Ap8Mm1W}vQKrVORnv=dq&0k z{juc2_SyanDs4YtMUd4UI^76$49x&Ym9%1tXOLY_OErSyQ^afqSwpK4ZZ5roD|pz)SQHm+FD9%C{1~M zpnbbl;JbzEniy^#yep>sA+cj-B38OxqZ_+6su4OXJ^qO+%|@BUE!$%(xabAK?7R6i z(cs|6EA2@ZM6}n!kqKH>rnIlN8^_(0LHZ!U}+qQxScef^YxM6WK}Wbc$w8&kFiS^OZ2(AW3WP`?N@9L7m4J+l03*pMLBTr|s=<+KWgW zpE8U}@5qW_p#TZ5o}G9196NYq<9R;wZbu1lhW8*vVy?cjb1qrSnOmePp-YdSt2ToL zQ>(|W^B3k_vWfO0L$OA;S@}EPgwF!k%>t@!*yH2k$#t9 zyH=gImzsz81DozoWb+?pJd-p}vemdE3;K=JZGpM4w01KY2j_-G#Yl-3=D7<;MYPOi zpcfuSR*OezqLTz(&r`*y`58m4=O zkqUki?^cB#tQVUC_;PRYuK|v%V)eL@l3|?TuVbHDlpk+%D~i=c)ck3<6v~=whozm9 zcC2Pl(l)0nnFF58;2IU-+#CIG?F39Hvd=h6^=E_W>$hQ4B;ywvFYU@?Jx9Z;A4!_P zj^UjkpmiDHb;r`r>OY{@qg(xHRwk&o`T^dR&SdbHX0nCOgJ)O8s`k473_0G((F|h* z*x1~p?9ki*0MNjUfFMG7L3-k7r%EJC(Mo5&k0I#hCIPR<8NCv(a6|N3_9dsjr)o4q z_#N8BHmCaEA!R?@jeS@{0(}0EM@O+G?`GMh0zn9AeE5A%d_ON8A*{RBk2T0_gtqQK zF}_b@ffY8|04otGV}!*r-6QR@pT}eqt5Tm0#~j?z5`GA}I}Z!#Y6rgC?5+YRSgZ}j z+d^LL0m#0ZLGnvl)=a?8CwNZ1zD9dE*Y*i9cD49pJ;Kk2#cKu79G9M0^y}iQvy1e# zA-9j<#+2qH;PDsE{DFJ4a}0G$pQF^*FreagQ6rTS$>)_#2 zoredMWSa+pej-HL7VVcFMmv>x$gOr>T!}xZ+VLGK0(!5)V2mqJ0V8b& zcyj0ge8-l`Ue(RCeMg?C(l&Fq1A1zu$`=i}uXzv$D7OrmUt;FupmwC&j-bT5BXkG+ zS}R$+=M1Zp*M!q$0{Zk&q?C*v+iBptr##XcWm!8&z~e>K}_43dSYP zhxl)^V>ooukUT+#-siDBB|ZTf2$%=$l$qWVR7XI?yNK*HBN8Z=8mcllA=K41wHh(X zCc443ZXJwRDD(G$I~UQ8V^QcGt~kPu4|+z4^^TmnbD92phOI5hh*aa~!BCJPU}1Rn zgA88Uvha|D!jMpnd0vbNnlDpOr)15KVCAb+gw&ALm*783hJf>SZM*z0 zI-ze0pAn_nCjPHnG}dck_V$fb^!JfQiNbu1RX?eJG4297AM`GDoOT+z`w-FNMy0I+bjDf|KdvjkQhv9+C#oW&a0XO9F+E80uLDu$`F zKZWRKa3+p?O@GI;z;~5^g1&^pZDRWOm|-M`Yzm-!pg`i_{X1PL_x#(k5|M~Q4-;S8 zV*tLUTCS%kd1odM`jFbBi5PL1Q96j0tUfQrmV@`6g9u4PosDt(sg*R)MLE55Xp}SnU zZJwoFdQ^r_SgoAI7u450)r^CObQUNngDOtgP4(`gwJ2@C2Vn?1PzbSIA#N9b)B?8j zbQZ2-W{G?Mr%9EG1^+DB0*e;)L7(H25D6lH?H1EaCmNbA@9vu9L|Yg%iP^-KUv{95 z5e<`A37zO+aw(|Gj-y%PL7%#>TCMQaWUyv#E(r$_uez{i?s*vh7{Op|r32enH$Vtpr06QY%rcc%6i9`?7P!V(7XZE(^_;K zWF-eI7V^r=&rw4(sP#&+pYpUPq z*_GQxVWbCAmHYMA^LpH3{Nx{ZuMM)TRq7X+fC{whmv&M2ST--n3y_?Lt62=M-kDhF zMMe|4tG2bIL|bPP#`hw~mtaWuF>i?zSiWwUE??_{)v%BR*sBJB_RXubp2$D zGx)ne4Nerz_$ceR%%pTRYrE33)~VXaPXZ^}WL6R&+2vKlcM3{Mtf#70jBKF0J#T_6 z^wO|0Ds#(TaXqm{tKJT&BL3>1xM>wcIpVb5hovS_laCEV+u5~b3kivuelLipk_eaXRH7BI!sN9*b$H#A_r1_E z=~EsQT`A5-GFm(}HokC+L3H#L^7r=@B}I46@nP}O_|4o4*rHF6Xe;NRM<&)-4~=ixof4^--|pG+()wT;jsI zzUSuYUV*iwFyH^&*)J^yo_#O-Ml6@VlhnKJfkFR0_6dBIdmIWfU#`Am9g(eS0^@{rVAIH%z=4)-F@t1zJ19XY&*#Y1S?pMkVJ~ejw+6CB}`P>QEnB6z`WVRuDvSZV<8KgQyUbq@kjsCszTm}>VcAN&$n~7`L#pbNe2bH zMmszN60;-?^{N3$2wm$K`bkM!uA&z$ALXJawu`HGNttFM7m)STP}^#^&pS0ClVU&@uLh&Ok1QFmSwDTPEqQk;R z)78UR%<$s5Z+%y#_=x*={2)AdpQ04lVUm=W*y~O%=c#3|3R#P2)l_?cBC) zoy-v6fO#DZ_diosKAnEXX0Lb$@h*TAGLXEs2d+r`-YTUBgAxJ-$#Y+2JwnoZiyVbm zLu&*?u_ex^E5zXqZvC+EUZF7bVTjKHx0qPCiv${miOobAJi-HOUgZeEy$#T^zD&&N z!4rtdhAyOTR6~NFctiEiYC+YfeB_|ugAj!hWz#(PdR?SJIDdCaa?Usyj!g#|ID~`H z4)d<~m-48h{Z-rd&Swg=f0Qu*19d3%OcN`Qe&%SPfxy)#R06t3oFts+5fOG#=tKvn z`Z>4GlGYHjnA~tpxAR=l__^q{;|(BKB4(Mc+8+I@G*!yl zGAc0l)v8tFT>Cm8_(Kbbqx>|%uULFZ9C?(+i}HQr|2KvyMe zr7-a+ea#69zle5sxkDVCH-1q|)X*yg{s5R0+cOS%1X)d!EohQcS9(BdC$ogr{g{kr zz4PSjMZ}k=%iQAo3SmJ-0KSKQL@SYJ`BgNyUko+l-eHA)CUlA08)Zu6kR^qTtkD|l zOhhKCW|vVs;j$71rqhWc{MiVe*10v#pstYwp>KS1v-vNatSFSW|6i+{)i1*!-tGE$ zeKTk-aQpYV<(nH;M}%8dFdDP1x2gW5Pvczp1orygkDK%yt#aym3>* zV^NS7qv1iGz&BY$vM_XWm8#m7vTLx zI(Dr+k;Z6xP`et(ObMo~)`Nj}Tj_rhf4SWG1?-J*a-I&QDr*>}zvB->JMI~ov(LHd zSt382IUl(F`rC+@Ytq!4npOtinuy{}jZsyy{@NBWudW%g3$S!B!-}eK!~3&7Zu@A2 z=D&@j4QNZRl}j?{F3P?eO+Wpi#a+m^rw>M)%Iu#I8e4Dk6kNuv#^FsKduq+?c=OBX 
z1(Nt_wW*2?CgO@(G-A1$Zyj(xRP*y5oqpAY0EV@&_=>9%huk&@hOvaoaw5h@Mifb2 zfNb{|puLKYogCmsH@zLgy0>+eSiC*b9&i}q{YUdIi`H3HfeW@#O*7%d0eWu*Q3hzL zQv(MPt7xk_{+I1TTO6mN88?HfSEL!{@6z9Pa`b!{F&+rt-S-#Mp{|OjH)zJK6CXO+ z*a@7%n{X7-rP+>A@Rn?YscXA37_V4#VcN=s-a&zc6Brrk*I+&ft?+9TDMX5#sAK4m z>N2VNXlRpUi6RbOzo!gTg*CFtI#-VTGv*y158WSaeX>>sci$z%KPwLY7UjL|i^30~ zjul{6lsXRLP+Z>M#GAb_xp4y_`elicydihF8Fn^eC!O^< z*L>&ylB6}Rho+Gbx!NOl4HWSvyuW;@>Nh3Pi=g&QnjtX~(XGOsP)cCF_4lAmq`XWL zhxMp%whZ-%;FxE0uc6Lc6><;IbgWv5uoIzn_OL4_HCVsoD`DC`P6b-y8EVHf?R%`} zAMf&U!cji{U>_BtU_MpA9cz}={;8=I+>>|O-b7n+ptFH<`-0efrT6r$pXT!iN~+X)0o5W~K-j&(5o0uRm!#G%cg(U7`u@NV zo-<8&SBuKsU$Qx{)Kx#|R5l_Z4Sdda4W%mS{iVE$EY5K$d+>)yu2uM6q_;ti9;)I} z>0dCf3$R^_OKuuZ=#;}S?9;%p!OwCeozg38uR`xj&SxaZw}I#V)t=`fvxQeFv%4O0 zDuHMe7U0rf0M`>uyL|7^C-Yc>%z`_%k!omuJ&A7L_kWrU_C9-=ij6+XmEgb*@em>g z(BR868$LeN_mxSFYTkjotDX*wXHYfQ4*Klg$VUjL!L(Flv#|UdA0Qbo4#8c{6uYP8 zkvr=l3Iv2=4uIlkD|N{pb-E!ES_2$EY+TCIKlyST$p?!9TKjce_NUsE6KOoSg&$A# zD!p-yra~f&9}bR<9I=YUvyN9cEjh`g_Rw&9^8Fk!7}6Kv(N*)5Dsks{eOW7W|9WL< zQFwa~FEj12nT^^AwaK3(QpwK&(jho7aeAhxqzXN~pLuK4M5QG}m z3I3s=yUbj}lsyh=oA)rWMI`v#kPO@o6OZm@L)<0#v1JJkfUGKu1|xmt8H&bt*n*(3 zKW>FC=9XBY+AR7_(#YHHj%wkSYweNNEN0}UJnf{G{V`?oy4+ zy9?(r>Et|e*T z9_S`X`}TezsUpNpvT%3gA8gfM3$qH<_sad(_a+UD5!-ldu%x})m0*%90ZR>=;?9dD*)v&;x4;KEuIv{q*(#lhYURPM|casCw!##yj z1D}m+EN2*Au!2sDk|M*Wo+HbNrxwXCd#Tay<4hV}c7xUkoJWM6G+6{mg5R#n$(9?w zMA6I0iToIs69wtSQ61zaV}qsNas`(Fsby4o*l^x?-Yb7+)2Bxz=6UWK91jcvTSbu8*35i0 zEGOGPA)U<6UCQ^>AXCh)qJ79PQE4;U1kn?NO6&;xhLpv6wI`F;(L+B5&+-e5cWtkk zWN{P%f=|B40%9yX;*`=SxiVrf=UCmFr}GfH?;^S=2m)iyhrStKAay*k&OC>A#t)~s z@4qhP#!J}#2r*($JA|?HJ6T$@dO%{a>}Y9+aRYE#<;^n9iD}$fhxZE4e9pIWoFigLj-1u@cyWc9az2WXF1CR?3RWEz>cD%UDBWBt;1) za9p81lneYJW^BY|YW&0wRF2#~vwlY}-N*_DIX>yw!F>oSs5;`#?zk3~A!qMAfZvlqp6kpbYm9g-23-Z&u z=;jwP!818B90?0TY7vLUE)_45*@_4|UOG@7@el5|G-`nX`W78MI$QhR04PXtw7M9!*SXUXjbhse-?6D;5js}JBawO z-@~?9^ew3Xch<{e%QOpr3kwotlZP4dS|9;|5$8R`_-@3>d_9WS)K8OM(DV~6y5f*I z(Jh}LpA8RO;uvBOM?E{ypIN6N{N}>!0H|k)OrDDF$hS;;9EglxA(?faRg+u1?y)}0 zq6K#jfBB#NK)UXLI4P3!?S}bCyXU_AFvxmGAO%O^KSQGvn__YfV4p!vIcI@Ou6yX6 z=>?G79Q`VZc-I>S&LW>FDPas)yCS&8FkRn~5r7DN&Amdon<^45#_?=V+va4NDq-CF z34gNq^=P7W+VDACHRfw8YX#f}6%e?}lQ04(vlgAtaSz-g+D9H5@l&20SO$mw;tK<1 z=BiXsGZ>Y`TsKcIIOiu%{~xG86!Vtcv!wOgo^ABD+ELB*bp@gDpoWvnu2BV)L4HC% zO5hcmW-SHgLM+v>t~}z;Oqa(3oPzkf{wngrwj(amDN{ipXMFuB1-Oz(ESPum;RkBY z`J z^gpH4mKLt$_N;rlLm{$$065`vh}(mrKn=>4roX@M=YbQqVKzgxK%miB*dHtP&sp*c zVD!&ZiGXPEkCpGCzlld7nhcuU{S%s9gSdP3Na0+M%Le|9Cuz$)iVBsq2t%Un$cK%8 z0SgBz4QWdovu5cdMT4-Km1PbQjF}q+vw8O_GQ~t!hi#Y3LbmZijp3B1)KTs8Y)I`qz#Vs7VZMQ7Q{eB5>H39^}M*yTrBp2mM;-y(j z2YGy@J*{1hrjPb4qNa57v-M^TDAfZMBR6e2N950qSZFx1jjHKr+E zUY$1)n5r^uK%uT>S0Y)4_%}1jDK@<-=iR0O0GY^$=dt?pw0!eXM~{32Bmq^vqmMBG ziYmlj`KZ>PIvizvwn$Fzd|#?V{bD>?oiPS=#aUdB;KQu#E~vwtB4rdLoq8DY?c?DB zH)V6(q3B^V@}rA)+EWoEkYd!i)Tx|2qAMk(DOHgqx|!$RO*Km)+xztCAn9=r?y7Z* zv@!5i?~a=6Kmq7*7?d+1S(UM-FTS)-`Em`NBDHJg>eDq8q&XYnhv9QF2CSIqj zzioXPj>Y|VSR8Ln6t)gZaP(w>Mc5LDlhZ6dtnn`utWT(__WqMzDGnj~x^3y@y`MX{ z+vI^h*0+97pQZO!TdyqGY5kJ3lt`^^k*wk7`*kPmziw|kXgD9JWTPD;LVcS@ z?+%KU6=Br*Df5hv-DBT(uVSeml+vk>Cq0^od7S|>|F0a*Fo#c*3_(Y32603tr=ftSoG#WKX_Wp zqI?dQpd4v4NBKbl{Z^q`dlZaKfGRZNl|z==Bj9ggQR6G)v?Fht>`B!!&_VVAFH6c3 zMrc$&cl0|f5P2KQx~dI`d)uT{W7$q)fv@UO2^9pH(&rRTv-A*7Emi~D#E~X_ybX4g z+uHMgFs-imF0mQMdVq?%T;#LOZqtTgnh3QuQw^Z4012_4P*n}QflXD?!fp-L1sy?I;Uh2&AQ|ACHe`bKJHt4DsVz?DI6ANO5X)2H*|u(# z2f}$NQGprRdu|2s|Cy5yG(01dYP5@WiiINd{1e9I?FVd=_@f|S#_Jfsxl3h+8F&Zi zqt0_`)WYxTPaq&jSXz9pQnlH0XSo~M<(1%}L6r|JFKwH&SwG{Ya&bo3lFFVcWOT^$ z!-_7!+!59~0 zqhaj&FBM2zx=6+FqKD`su4C6=Q9+8z?2Ncx!1@B(tZ7=~2^tEMlBYm1g?u>}t0z+q 
zG0Aaf2c13z_COV0JZbMpZ&DAg(WM%`2sDEiXNtFOpy4^@FPp z0aQgxwK`0)68+knFsJ^sbi^V=?@2g~qwNV$<2Z>&+p|uagou*5i@1mIgvk>IW9TX0 zVK=?__c-i&F_7bIzppGsq$tw%?b#Z2!m0+Ay}1F>k~-F4nX?#Z!KJ_dRz-pc+HAMj z&F&y*yu$3%1`A==F>yzXnI5^2r1|%QR(L&AwgVH+;p(mlb^4Zne633pCef$06-rwa zyj4!xHh4D^*RjKH+AC(249f0zDLmg^r;nusm*uDRH5QWI#p*>|PA=OP#lIt^;k#vD zBx=D5AovzhyLz0~Y&LVbfGxOG^$l&h8 zoq8u^VgN6Rqyk|7Flb?nK#VH`ax6wmtuwH3!pK12b~!))h?|0?R4WJDD*LR ztkxuTZr)p7?pdoHo*F9#618$~EXc*jgfdO>uJTM^BB5^8Y%^s4xt78{RUUgxps+{E zzTwfq?_rfu4aZCWw&=Dux*?VEYspGb$xr~Po@y*Z)`ZpmZ(7#Z#85M?FJ6od`}Vt2 zD@9>elu?3Qvi-9f@iDTrMB1!f)jjQ#Vozd=^BF^Hs?}KKgGPWBsWe!os_n#gIts!k zzrg`x5M|z=%J44QXoMgTz_-S0kg&zN@V-@;#*VTWP%PIcTL4O#!^1Ao3_>doyaCp%jT=}gb2NqFoMufnh#7x(vCwwhnc@i5 z&^)Rhes_SZW$gHuqD)-5=4F392%IV2`y#E?Ld3sLAbCJA-d(jqzdvEPuWmxZSWWm< zxjbC|*#?EJlg8xLMx988v23pGaPDH}2ycQ0WWEj;7Q84(LJq-(^zFo33J72!KbPK4 zOhJI;rMk;5OB&a!M(J!9d)Fw?$#Tr<(kmg90=YE7dA$r8Y_&aAGS zQ;R4`8Y09*_ZV>l(!ogu;WYbt*FNYv-$MnOTjVmGO9EF!nJ&6kQ8?M`y47F7&jXb7 zGe+@7AeAEX;wrU5SCr??XiB+xBkp-cu#f{5r52rp5e2Qg^J4$Y*RQ=G$5zo4`Qx|sJ?p&92uL?f?%Ilut>)zWMcH<$a!w9F4E=6x;;+t9k&4!3OMj4C4lra6VMb7M5dFWV%z;s zsol|x!#@dTy0#R2O+n_|a|#Pg{v-;ahSR=&>Xm zu0qTlIJ=bqZd$YMp>CZQ1cT+AY7oz{H{3F}LiO+=p)0ESyn=e_8qScgW>nI{k`rye zV+Bd9hp;L+3ahbtohx?S=2T?di@!L!5@Il9x!2$5pz>_ST-xRshFE+>Ug<20C^svY zY%%Ib>=bYKx}d|M4zmi%*9Ks%pnB*5;r!_d`c4K_9a-qoomO|xqoOYAq_i&44UONi zjaO!Tu_r*~>U~hUB5(GO_%nyV;+e89mAhg)sl-sKY`U8+6tghh*%nr6X)=ase+WWymgjXFo7ZvU>f%Apa^P0e>vYxk!R)w1fw{TmvAaAK)sHL-{tB$IaryxfhFL!eAP+peI`*RO4FGx1;H|r`b zV&D=4#d#gS^_=c)kcQ~ro1PPw`q^yVaXo_6F=^QKL`}&BRe5C0YfNHUyp=xfVz}1t zbR~7^yh18<_C}MuRZcbPlgYAUUsR`BZY5T(zKPJr&h{YKZWPusxFRRIxBrNv|_!$@FmNnXJUbT(%goHIz8k2%4XS9*^)fsY{I;F#3{eL zIGE+jx=`wuv($h#2Z_(c;=NMGINKXp0juDBg&j(#7UL^Ajw72hnxGllorp3J`QU33 zYCJ?luyy(rx+12+whnj+&o4Ao$GLH)y~Rxjz=Er5`dgye=!n!qgJLhVU(Gfq@1B1) z4=d9-yg#%Sl`1mLA?%8($gG_G5i3CWeJeh0LQF=jL?J`l(Fum8x-ge4|?THx~Gd%-8om~=jGE*(~R+Gqw6?$~CB@HJ+pn{%TDV|Ae+$(xj^D(RIubb(+(U{l5BrXC=~cm#u}h)laEd;L2SwnkzF)PGKql;CK}O^PN2NMwa=QY!=~nNo-UKfl zQ&c)>d$Upz9plsTGQ?Yf7)4mpwEwb%7Udv(Prt`_i1i_9^GHyxg(u+*c|xeVW^lXa z4|dpJ!Y=C$q)}$?Bi0c2#U*5M90ePyLepbLrmW=p=1CjJo`kzWXzXZ{QiiQDK*1>g zCk*V3Ni9NsS|Iu7Bw9eDwD;2WeowWPk=*I+QBEfn{zUttKW0qnSdt!x<2H z&NWwo0G~_E37khUG|Qpksm_>g$Og z|4KNnSVK7?!T{#4CQ-DYDtpWSy8Li4w2F8uM~VaHeM{8RRF|32&r};m*&^p7JR()$ z5oD+vTRPz5x{fXPk^3G_q+wD|bh?ln1 zhLz6cq#Io&ZYiKo9@q8|Smkk&U$gf0=WtD}x}`^Tia`t+PfYWt9fcrrM6?CnKc4;Q zOUY`90-2hKpyT#>-z3+n?M05C4FRpSSi4$2lu3FEu#{h6%+xJIDrFt~U2ax;@Ve1H zkYq@=lL2szX`jMtCyI3t2hEtKXb=aKsNA#|t#Bs%1-i8U93cwUc((1)SL9+iZ-p$B zLmzKjN%)$F1x!e;3XAcbjHD#CNnn=xfC+|Z==FluC>bo~k0H_1@pcL2n>TcdJHg2>*LC*|{QCC-|3L@=a z(wQDwy8OwWfeQ5lzzv(#><1vinmlpY`Ygw)YyR%Bd>^!#I6fC@N_x)SP$K0&2Dq`d zY*GY>wL_2Z_QP}U*<1-o$l7VAwYFxTVXUi`{bFsy(z?1qJ=EW_hxIBwS}2>9*_83~ zjwnQ`+zhKdE1ChO^k+{x{{6~P*%n)%nur-5(m%D=%7C$`bZ}M2u2H;Y}nVG6{Qx=5YB9j z6&m&_EZ|xh$FWBi;DMm|aM5*0Y~e@IQ>O_q9>pcGQ5TcxW3W34I4Y~3F3oYvMX;U=n3betR}KrXQ;%_rK9Y#?4zN61Y#@! 
z=3Sl~O>X`V08>D$zh<2<7t}!fIGrPfbR|Kg@YST{X7-&HPgdU(Eo?-!zKKGBg0w(dmok1Xqsl9#V!!@y3gYqACoj=aATAm3kbD-YrVRdo_7rHbvyNs(X?WQ zq-(&3QWlGx#wf1>ODX5y7vmqx#N~)7bXyTO=+Z5d%B_+|#^h*?itkJ%+kl5&vJi41 zG!H++$rQN+q}?dgu2K;?#2iyj+KuoVzr{v9uJdYqh(39`_RoFfW4NSl+9x;^gj|+z zGZlhCxSB|CT(TB~eQpGXPDCAfq#=X$Hx#qv3D#VP7jt+QU}^Y?J{8}3{!EOHvV3P{ z&=d;CZ+5Ow(@!ad;;*`mm7Gh36ms1`$LDv|L9O%=Ovp`|OM-A1ZfrHr=@pEXs|W;T z@|HjLi}Coy!js~c5>4?m7X^5597JNOhaNR=!ecX%x2%&8nR1hJqu_q7L9EV{tRNaU z75H}2&nw|Oh51x%>LlcfXuS;HvIpBzLVl1EAr*%6d>mXqI;~n(zt&T*to}@hgC+GC zB6}WzijIx~xllplsyx=%Gix(s*_Coo0=Iv)HrECHkJ2d>^9Pf?Iu|8@<#Fp_A~X?1 zgxQ;vF3kaj0~&qq#R7kl7M+DJnm`z14@yVW;D90AM6M`> zYs9FYG@nyCBTh0})}DJfpxnPelmYnrHwnG}pq#)M4g{sUf4eI@;VzzUrr@4lR5G1p>$_?^+BR*K-8Qs~ zwsV!hy_UbgdS4UhN-J>s=-rO^Hy$)y9Zfl_vHjDR5Ttc122l~Fy!xS8^ZX&aJsuU1d?)(hUx94jVpEAfiGau> z<6wu#=o`pp2v&nUa$!)Qu-s(2fn(5!e!Nx94WX#tl*9tNq*6~+b85p}cxaJrjPq>M zkdPwg(}rXklii+(`7d%PQycBigvL7mr>q)LZ}Y&XN@d$dfE*=l=FC>fHjmHTn$O0p zWbc?sncMfcu7+`HlI!A|I5q`NfKay&m3i;Z6NPef z>HvYMo9eOTuM3vM2DWsRg`J+4u*<$SH2dCibjB%YbB7QszF@^caM}9PyPr;z%qBzR z@dhzJaJZ}-0KWdQP(E5ebtnMFC_apoXM`(Q*isAc1fDuHEko5mo;a6!>Dq0>tFpCk z%jQbB^Y!ONt)%pVCbH_QM^$+#j1TtkmL$CLL+$o$;2%#{*jzB%`nwLkmTY$V25;(I z_bQ#fp$WTQW_hVDIyPSqO z-tZb#htUT}-S6%(bpObLdA6OU|AfnNu8j>gNUU-TzS8a-{eG3EX!*X(`J6j-5axhqc=mJzyK`=&Qb?@d$$E}@ zlV;YO9>U}5S*^??4H2FH^~o@{T8*t&F>KZ@ZIi2=M{d8N(?cWS&$JIgTyGfWyPDF~ z{3yLLgS9jAbdgFm!|)VTC_Yl?L9rByM2&_j?MtKb#A{A8)8C}Vtc5y=V-qWh=HP6A zEtm7Z?f`tu~*Ra;3IW|d!M zslPnkBtMXyRSRUfgr?PS-zD00q2?(irP zySr&(0-xr%;?6!i6BYQnsTtr8SUmb#JgO&DHfM#=9V)~5j{2^2%F6`%VGT*>Fb<4D z-{9T%aROAb11;j`*Pm#Noh7qyotbuIb38iRzhEYj=_(qiSODRHWC6*kVNU^ATdX6u z?6YCobWsL=Jb>R50)bA#ZZ+F}4^_Gqley|Ek;IKcMV5BxtOVie@9tLe$TouzQq&p< zUeKZ$zP4Gz0D2h)M0^O(!d|fsM+R(Z2LG#y8(NDr058w{5jKmM6f4pht`18aLt00A zrO68ac07}aN)`wzSy|$dzU>Mw#;QsOcFA+eVWOy8EgsxD!5^#QVwD|wJ7BttO6`D1 zSPwq=T(Pl!_-rLqFS`a%_#eB^nGQ0)kVq2~P&__XCg0U^AW|(Hp{~U#8#r$1I{4|SBSmYgOz0MRjHphKYs@|kl_J|*irI6y@ zIg~%`YKl@h@E*E^r)VumzbP!vR>mqvWwg&27QO|T! 
z^h&e#ahq4)tO&S3mwqg=8bnm55RsuMe(3OOh}OT9rF#H;`ki3%DECUdw;zfEikh-S zrPH}n;^a9EHO0y*l(%IgjXLU?UUa6`A=Pi`uFs;o>~1>5aHbI5Bk3|y8J^|H=+=xxX>S3;5b zt5>4aY8BuzYq4c^umN?UnU%l1hl9#gYa5G)SO3a-vQ?K@#~4TZisHY`Q(-Lk0%Ux3 zN#6f7mG;1F3TU_@p{ddC_z!8eFr4LsX-w@iG>`MzKvII~mw+^V)`~!HxyCww*z|lL zT`4;4#T!#*{|fnJ{d2K!`X~`Cw+%Kp)h7RmpXWD1lJMe>-GTgH*c%pCk=0>W?dw{^ z*luarPb-}VQNp~9wATJN^ayEM7Q~-MOL095qR_kv)msol}eJ&WX6$Q zc5uBvo|8|9$JdXcHf#8K*lq7y{*^~IqgTw`3~RhAUJUHh%%Igle2~=lJkv|&(HYE4 z`AB8yW4J!llvoLG0I0oSA+8h!P>@N;o}<|!PSa6`2bsu#{&u1*%C|_hRMio?32jN< z8%k?2jawo-$@)1=BH5_`!BzQGa3)^Y_>t8r6RetvVBbjs(Cj4Op$6$5VM$_^qLhBX zLjVlfx;`bj=m7T6z7{G4!J~QX*(E3^vRG8d4w<-hpk~BtB*xMLL_ljD-~hnm=B7Y8 zzuhd*Uqnsprpc1JX)?EGzlw$FF%-xx_?B@9os3gKFd%?&>muqfTEcEuzH*54w%zdS zB><=RHCx9ifzk}uX7pR!P)n|Jo=7qkO>|$S_ElfZ@rAYzWUA0#3xjT7Oa8PRPp;L5 zImIPUqb;Urxv>|!9-v~3?25Y;eisXw*a#!5s9qErKOa)a+J=ujlN6I`l~V#Q6uAZh z$}oYOA+s3TWBBUO(jS*nM8hf0H>r`OgQDXv?sUdLfh;_E1#w=Qg9EaQe6yxu8c+od z1UHA_c>&c`Fa$w-nKs7as>9Fywv!Vr$jL|qjQ!2bZflJ*)Px$^mH|G1_mM27aLf&X ze{RUY`}^z(4i>Jk!LBaaQPVmJR)v?2D|Zm{HL!I;XL-NsJ9>!p`i!NNHPo%O?4dRb z>tk;W!~eMdJ-EPUhr!UCLzoYtMMV{x_dbKq@xQbmGP>)-KM!GdDbk+ z68%0kS~WtR%&fx5e&wUC4(n44KV{G7@jPtFi}C-qu>QLuNQTeTZ_5RTaLD!1s*LJ* zs*C481SsX`XYNlsCj1h)*HVwqukM61C%ozDCng@zM{!q8vOjwEP+F{ynzR)iPIh2t z%V_YYWDPQs{-e5dE0?J@uC^E9SgPu1cqlKSG9!YYasR4wC=6l=I35hTHHS3^pnZV@?R9NSc|$1OZ3_WchTeZj|D0Al^nvWxPN)v z-AEm=Z7FDazX{g+n06a~CZWSxW<1fHcFV)QN5)ZsdkIc}i#*L*Ebc@O{28i^beP@wsoH7Jp?&}1jEOW5m<+sMk8g; z{nszd$YxJ;G-_l*rH#atrBnX{)$H*%QWNSafzAKD!^p0<;mbuXYp_I(4ybkbFyLdU z!|5mx@<7SlIk2ZFxWmaNju{#mIR0i_So(MUlgUWO1p_RQnjj%#m%XN#dAaxgGVKq% zPq;#ohRT609JlSGtKiPN9&k})YIJvul`%&yQ8k)|qY%v(6tRljn zy*2>H-Ws2ZNO>$=q1YCrvhv;g_jC5L{zM-j>KAeB7`pC9da-OsR#`xBcT2*I$wWR1 zEqnwL8r!|=A0$7Dk56It*|FU5eM}5(9()b<41r?drluoig^HeJ-Wd{V)lT&x*ksb@ zILVDYl?TW@t()esXaK@8dc)pRVs1H2To?qBRtLcRm|O;|Zeg28kGI!VUeW${Au3QQ z>Kqc(6-LDaJdxU1_Yw9SjZC1SPQZZ#>Qa0IVP)~XVB}0xGzBkpvW1J8!i0l2w<;Tt zJ*^_;$$Jx1S}4Lk0zd*qV3D7`g5X_t)qE-v*r=i%YxA3utE19hKA~o6DR-jzy8bv7 zIp|?H!oMH2_4%%&X2w`YKa(K*h!+GVRZ<>H2<_KuJlDowF+oVIX-&`sv#be7tuTap zNbRhLt?v*6LYt!QmitZAB(&c;cxw+j^|vBC-=MA^F)lk!^Na{G!c`vDTiUVbF28B` zcoJ!r@bj?#z(q-dv_iB&XEWnkb7QA(sao$a5_EGzTkFMCTz};%*2gZEO;8X!IC{~2 ztxmO1VurFX4d&QV_teeomN)LWeIfG7Q#8!UX7)4j?sSc>Bzdx`^d27;#e8p~r75w3 zh{?g}Vd;7kwFw@;9YW^Q(Xu4hjHBbj8^?yD1TBs;9%MOJ5sKrV7oPN~kthfVyNp%K zQ{Wo5zqmQ0cc8R8?HpBe1V(sE07YNa@*iu{L^HpVNsDSKYRShlKQoY-KS^l8+u>+S zm=HOY$9?nF;=l;ELN~plxvVC6C#Ld$p3dobeU1Z6&ILe2kN#s%EBu-pF;@_)9+tw|O)X9w*#)E~o(g-5 z?z6Y%jzLH3!eT3{$w!^O<;V8&0|mg8j?v-UHbj7*h4lj)!D=N6wezk4O=gQXe z?B0zeIPM_s+MUzEbpyTnmi)8Aq=TI7gC_BA*a1+jmg>Vc(7u;0-oTjB=R?n;SZI_C z`9sjR*0QGx(8*={2OEbg(e=Ln6pNqK&qY~HEysA;S0dAyMvjM=Ri>@J3nnTzA0e9N zn4j+B4mXG1>7|T@9V&?pVsvcNl6^ z9<1@@cL%2tK2!WfXTIx!L*At_o~|f!^w!v>n6=Nw;=L}%R*eWJnpobrifVv@p&7sO zX^dh!cUT;u9(>M0{UIISj#9AIXQH>H)(lCx^EZ(#nBdxnWpO?Yr$+%ZeLwD@vo-280jv_OY4%UhH`~?qUz!s?v5`sDZt_X;^=E{T;@Ji%yC+vlgh(~t6@C~7ZM3CTU z1{UwP{9q;Y2KZbh9}IXZ?6u8;A3nq?@~L1syf zsL9*`iYC^^bTyglkI(F%fteu#Wv#1aHt!q8R`wSlFO%BOkPZ{So0LUeXQMmYiWfzF zPA3u)3Tncm=l2ovh>Lx>ED=FNrEPyYy?s5yGXIOr+X#?Xm z(8+=yWre}KTosh`qQy!p$}Zm3a}4AM2{jEC|A8cAi~d1&^a9K7e;t#ee%6cd>dXpY*k_fif zFvbLcxHsk-4lmn_SQfb2pa)$v8PjeX59=UM@ayjslz?;i&b{|h?u(-gZOpmj;Wu#^1e;jC-ph z>#595o$0~FYAq(i70}qsz-neZVp|#!E8xABd4P}xY1h=OGWt6<6yP!3sjm1StpiSec2${{hp@b|M;Nmj@>o#2RZjAk!4yj}HhYKkqXo)wY|#DHzx{Tu z4Ztjk42qlbfkw91vbJQe1vk~3HsU@F3d9y*h5a(fb>`+uM%1MxaDu=e*5G|8hk+_W zQbbFzb?iGlx>cARKImOwB6~+P9{+5{gAOT$IZy|sA@oc680bh~1)U`)=6YJNY^k&D zR6I?Ti_XuFzewM+pP2PPRuKlt?wuV1h77;pfX*@2S7r^Te8yzORbXCz3jAvZQ3GYz0OebaWiU#{wbNw{sXwpnGI^S{ARQKS^gw-a>Hs+ 
z(3nX_(nQ?gGDylfu^poGX~TeKV*GTq;-er${KdUHo;_qLL#L!s<)i%MJ<{Z$@J3B= zxvyWrFu$DMOKUus;pRd*Q{TfR(%1c_+0$4;r~9IwAPxWtCzD9^-d+$Zg0M<6P@}Q= z&kjnLvOZ!ky-+1yM%!5~`QK3R=&Uf#Xg5LhDdkQ%KEg!aj_sL?ZsB{~(j)Y{8&%*caEj>LkLsY9|HmriDk30?ebsDs zfQ&i8c!2qwjkgJ})Z@(-WUoIb!VWs`!%r?@NZ4mQgInltJ)gqP(E`EW<{92wo{m54 z=E2kPf<^o%pOpril?>43$!e4EG+3qWi$_lYoSl#O2fo!~U9^;$DiU)b-JBX2a|#Kr z9TsM}zi4wyFKKBG?RRzRzvTQULlUdFM9l#V+Dp#B5S3qyLdL4m`M}`VP}NhbP{8(k zfP$`Jcvn#&kKLHh(HPH29Ebs|K7rom(Wj?K-^HJn%j55E&8p~ziR z2^lK<+M=%M0$x4qsRkKTWhaok!BS+~M0Ewnu(C&rJ8UZq|Bxy&QzC@rkYmylg8X?w zQW|v#BN6xaF%@cjcPzDa4U22lujqxUl{mcXdYI175$yte)^MIok*ioU5mC*0XHH00 z9aZY&^>yf*CpVfbH~pDsRhI1k4t4I!C<*ICg% zIjCmAR>6(Wdb8HIE>^sV3{k1(&_d$4O(QT77x}$U>fAkOzr8Y{S_y8ef$xo$52T2J zE`kE%ckRkvtK^@*zUY8CWv^ty6V9K;y5tq(WG+M8GJM(>-?Aks;f%=XA*y6=7l;hx zLt{3`4Ns)JxHlSTidP9`aGjq{)G=2+y|&bv)%6N_AxFSS=l8a&UihsleT$VK*u@KsQ9HaSqu5*R@GPTPPDm_Cz3Bq!MpP*j zi<1?zRQ5c-p8xqG(9A&;rC4+&y^O~uf?{Kbo!bIa%u$yNLTx=hL;yT^2#3ND z7nY?XUpF&&{M<T~)=aD4uu z#LBtaca0GYR>s?W(4skoR_^$S+Jv~V4sz^o25wkFrrD>J+q^&ac-uZQ`5G)RpyYIe zO;7u{{04V0k8}*>Sup0$0XbSg|v3g_m7G}%;Iod##I|5`m+Q1drfn(n9b&||CF;Rsb^q$k%?l(m(}G8?BUzz7^khZ8$LIAg55I-14RhT z@!+gf<}7t+N5l7;S}IXA&=K?1u5o{A5oQi$&fp0`p&tRF^MzCQ+&rBJ5K;=floMfI z0f1Y-TcdNBAES8#q=x=$@mW8qC5z$>@A|~yiK^mrx-uK26s}!A>uc>LR0~#K8tBA5 zn#a-J!pKlqV5q~Y9`J!maEnt7iJ9z)7}?s{H28l8G6*uPvgXUtW|{wF1y4SFjBR(Lutrdnu~(-lV;27Y8A{{ zsDp(g6H@0-et*ICo)vNN#QMy<@ zTle(!#X?py$smy3ZA)Y?)0})yn45eYdXXOOGjpn^u$#rqqA*@Shylz=&v9v&R(V!PNHQRfKB1*YmIFMs;Gi>bE?BQd~viA z<@4$cf($sR!#GaX0r(X!H^Rz|q*aEvjkAN78WQk^wAJ-FDwl<2W8U*fjxsPZ?W6ch z_*aRo%l>o|(Df>0>7T~T-~L{Tory??NxsE~5^YB76TlbT`wia5CGQ(r=14w?1bJ6k zi$Hu=yi;X_c;zZ{@>0JgTJYQALMnx&Jwg@!)X`KMsQy2NJ1#lF6c0vmpkoSig)-ly z)iE&+vDU3s;hAM!C|N&RQ8Za^t;AIi-y}|Ywdklk_CoU=;0y=ip{q|np(`FXYn=w= z&lTZbvQWx3cPlS>6-=t>3<&;Kcb;OQ?kKoXM+FM8b+9qM=YtXiFt&F#*U%esj?pB> zFth@+I>OX4JAxOhzv|?u;q(E0j@@1Sr4Q+(<|GC{XhPEAgt%G-5!#9@{!HY-7()Tv z000V6K>`V)M#S%7a3L+~EALcDRNBjYve83mZ?TPURj4-=M>EZ>JJFD&$q5KI{R}V7 zjHHs@fG)~ATfI6-eLt%&s!d^T;1KwIW0pDBzOk)^T|?6G1*b)9y5=yv$(L%Xci#`R zOI#Mm1!hz&LwIb&6IgxVL^&0V=m%Fk;on=ZgWh3&P>TA#3m2Omep!^5h}IW@-p7Qo zu^O0L|AhT31`_n@^R(J}w;n4ebQ`u=&~OYqk7q=yoxzib7I@mMdf@rV>!8SA(R3>c zGTL1J6)2K6TLnybN-R%x1cY6Tx`C@wpu)~#MBcJZfLoN2TGCz8s`9Tc@m3e@kti8m z;8kAZ;jXLM$zn|_WyOL{F?}3nTmg3hY!jZ|dk71+eVL6}9et!#xi(%c@3dOn8>`mD z_J!!dTS$I_)K%PnO_yjHRQ=DYMw7{BwuxG`6cYvFXk5O-K`a0^l^}hZ$5H`h{##sL zsR?7=3$Wgjx2G*$w;V#oE`RaK6agU=gwBGR%SrAzpl>X8p@Qgna@i1s2OQg>QSfnzpc(M65==Y9+6lFLh!!z+?7XiG6I04jt!Ab*GpjhH zhn}?RutbkojQvrFlLdfEfJkujTd`->rDV1SpA!%FmZW-H2Io7dtkFXB%jiBXNPEx1hP5*aFP|?|t%75TtIz!OvH%VvBNWoE+qKj>|@+mv36x!WrP8(vg@@s|}+-o9Bq;qr$8bO*7ghG_GG?vspxF z?*9cInjtFMQr59d8DchxY|(dbh+1M#G29yYry`EBvaL@M{AvnG;Y9}!lGxD$M8|Y$Oy09z zpeU`*Q-gRMZ*to^VbWbZ;Kq^^Fd@DSGTM^a_U7D1k&cG!x8c4o9RdO?iaQO-P3yEF z^uY3u3&wK&{f0o@(QTZ_6I3>`ZX8Keg#eJ-aW8y#0{}VvM&67N}UBEbWu>2S8N7;iQ$mTKp6vQ8DICHvoIF-UZhIM zrSaoJw3e;*Myl3IVPm!9y26va<$bYLRWkK*L~@+;zTT0wg{k)*mTgtuHtf6OgW75A z5FTIDg(-x=-VHGfF|TL@8f5BnF}R_M$e@M5p6teYkfj|e3J@4&+GhgIH1fFdpD{Sq zKk^-j(J@7;;Dhi35K29%w21efg@Z-GILU;>pNUm$u%ZOVfesVvQzuN|17kNr$nKiqoo7BoA+z++`V4j zX9cBttu}zYSvs{b&_Ackn^dB9U9awx(?M`EEy+lF2p@JfgyW>uSh}t=WmS zRXt9adE6HKjEZ>Fx6^TZenAL6DLmrG4Wx{v_a@PwO^AV>fB`5pn3@wQh0ur1btS#(Lr{fQOq;nZaZ- zhtL$!hgfP;(0dOTRvkPx0NXs*ucZ&Nh0~z?*6-~Qh`ct2I)1)$idXGZ4YjMWZf6+h z{<`hak;0h9L)C+DMJ@<3Ty`tEu}&-yIY<)8dWF(_VPOl$+?^Itjoz5>X3(s1H6^P# zQa-wMk*>{ibwTx8@y?L0?$tT+3g>LKI&N>mp#oS>J>2*TNG4LgoyhU(F(qF&_eBQAbU-IRjZA?3{iA|!`rY->eksv!CHi>2e4BBge?-N z;87zYI*!J;sU8U&tc5O*bfLMbk}NX9-dD&X?Y3eE>4^XT7Q=OYhB^1o^LAyNAiYx~ 
z{z#!BPo{d5$e|`yQUhpPMO87T9jZ@rE9YoQA*(LKcnlNOBZu+qkxl zrtCpX=Vr#16I-c|dgvr&5=>-euwVDF9~7>Ow)D3uC>SMad2Vut^{xP(YwG z+IY+m@?jG<99I1>p3i(T`OyNHHM4|kT4?kPG{CAZw;L1StWG6(i@>^UZ!SMGUy>jS z;6j^LL&5$s-Hn2QxRwcLGb3B*>>^&lkz{Pp08L-FZfy(aaGH{;qMzwJa2k3vFZib` zpy6s<^ifiQ(#d2|ZP*(7zYh9L0D}BuzxXJo^%(;6N$qWzRbeAVkX%vi9b5olA-%XH z(F+E2(5$p5T4p`-Fhl7zOX|ZA$G?Y3le(%O5T{SERHU+aQqY{)mVx{`uQ_vlIvqRL zeRHDPR+gK(y;5Knf0*jtq{%BX$R`h>`^x>)`1cqTBp&MK$xKMa6TVzWB!AcxiaHuY z#Z-1rD&KwWS@~bph#0x4MMGKEGw?n{kEjoj-&W8wFLP?}SMNGgdiB`^&N0ymALgpx zOCtimG}ZU=e(BYu!tS$D3rO_UK22juzn0y6gFx3*!}6tHcsRkza^Y2qrT_p3xIqB| zXpyly|6u=K(av;BipeLZ#&G%AUHR1`yUg2fqyz=c-{de?cxZ{=1jGZ{$iSO z%#9C_ukykw__~xW;ZhSbYK*^-X zANFn~orjo?&QU7K8a0K@kVrcUc3PkHqAz`?JAtD<)e;tRp9|V3xiVF(Zq0nPT+39O z0BV$#tyjG2tkzpXk3M{d&w(DxKE7~wH)nsP%jQz?N@Pd9r?w}9P={}v&M=d^%tA4k zYP&Aj$^B7{jVwomJXz>y`G8jQo3)=>?xfZRW9&@CJ(#{kkDxg`H>)xoWlG050oqlj zn0k0w)Kb3x)r3Mf^S^?H%?aTo0P>o14w%)6=5z$O7?@9W??&Clxu4iu&3 zgluQgAkM?E@V?B6ahMt`r>C2{2<2OA-sg$*-36;U553HYuoUD))z4eG7x`}#Bus)*&Sz>_ zT~t$&gC{=$W~An~T%@>K(7f(v^ZgoJx*p;;WJWR(VM zbX&@D5X|tZTFJ=Fe^3^h0D<*qK#AI`3tH)o=dzhQx8`w6gL?UW)>~5}Hs5k_PqhOh zkmc;c++Psv2`9s+`^Fxw3i9&kQfQ7Pb5E9`z+jBtw;lJe92*6)Syb3*5RnX9rM?fP z^C{s2FXnByaYO3=>xIOpqAa~+1-hvx4b{N>uh5Q^0KjV(lfE9s*2dwZ? z6c|psRUFyFMq^$&*FUS>X1GG+#xjgOE40P$9EkNG4NG^*LvDa%RwRZYa2@tV@EiVA zK-$Bc5mI2hF3?h4;1woYuuu1^_r4{3P`LkeJ`3XA?nTgm>qqME^U|ETWH?z2`mdKi zs431v%NJlIG;}Q5s^KN!`}%UOOZ26t&QC_jp}>?GfU6aeF6eE(R>*#<(TF~$iY^?d zdLLV{oCLEZb1I}%MA}?J^BTC@voksF753X#wrWbOct}LOIl2luw)p!le|7JK95|fx z%qJ<>DhRp8cJ+bi5h2`u>aw@|I&OS|{zambVOoqMpko1A%=Z2j2bV&#%$?m4YpW4p zWLG|};vA8px~TQlA^F1<5~~kk0XA2EcSBWtBL{ctCrTw_cU;OOV$O;rkDhPx^WTDr z1eE&?($uoBg+|ktL|;Wn1iSk`T+;b>!dyGsFmJfOU`NB`CCx{<=t zFw6aV3D$_KLFql}aIo04pwaZ0RjLtWuu88=a%!Qi7z}ev6BWLY3RsQdmMC&Yu>Alo z!uL@Kg+jscZ8sYZx79e{p4N3@J)^map73D-MVUtQEUjP@xJxMIMIhy0Z@{m)cFJNc zpd+B_l4Yqu(T=ivs+wvqjJ!|{Rt8vr9TCZS_J_+>XICLcAJHcfyH`GgHZ?g4V~*nK z$ERR6rD@W@0^C2UyM?gLMOeZikHq{x%BgY9FBQ(=scX}y0oou^?;zsYS!aI6FaQ7t zcR>MyXpylN|408$Zatop+viUo(g4gFRR&uz)&!5U)%>}j!q{?k4p%dWW?Dr~6pSUg z2a}JQzVb79@FwYnoeRF=fg%ZoPAtny{QQj|x{WEw5)4rr%rnu*;H%6h4H>ujD9HA`wU9s1Rl34>7KE zeP}EILErTWUN{-6!^WKoZK_pjdnzuWdHWQPB>sZ@y2{ar!;)zXVJ(?(c6VnE7KIn; z9Er7izX@U|O96bBJWJaaY5NmB>Uy5k3==8}_Rc3&(5272y@tOH5q+@g#)x0@v~a-y zu~qUEt?fal@Ac}q=&@1=7dHui0i;!i;6GLd<##F4yIllX7&R1#`iG2KK)DY0Q;y$h z8@Y8H?t(44GFMLm3XkqyG$&Lva^G<0ztZU|i6I?%V_7eZuH?QkvL*@Ylwu%xMG-F|0bxWbLTXU1g%uS4+bhrnFMu2bR92_y^ zrh|wIrGDt<*0nw+-)!4Lxnx%rj4<0=S-*3Q1tb{x_Pa_$3CtS=v1N;PPMm1cvA<+) z8Jp(;f!5b=(?axXF3%MBY;iXgt|vs08EyVmXXaWX^up${bZEB?8*2HZ#gpV)Xh4~^ z^s0Pd-23(Axy$C<=md5HTBm-Yw=yTOOI(bx|CjOnbcxfTi?ScDeP1f)amw^|OlDi! z(d5ja0;tS2o?(C9igP_}sJW8yvU&#Puy2=ms1|Jwa) zN9RI4B7#%;AAM^W0N9ueV_FPQxgN+y8J2eqKTbim-XIBycC0QYs)e3RZOxRfF`@Uj zbYF0bc3HpAsI?W-)&hMGgSFK0kRSs*_rkAXpyk)Uupi*l| z#kaw~Tmt8B53`YT5TS#Sa{f8(dJoC;AJRsss>N_IVIQKD015^VVzc;BLViZ6aPAlc z(#9*EXGZ4NzSgt+0Q1WhVamJmh{(l9Zzl1(n-sZIqmwK`@nK-jSx1+2uhBT~n z_Mzc8YdPdPj|>>@nMZvJ_TmQhymF3B>PPGAPxdEivs!meFt4xThx+OZHG>Is+l8n& zZ9uRL08n&sZLF$l@|O;i)O>qI5@xJe%$3_J{*aLDYDd=Y3X4j_%NU4AIADQf{%9G( zG!rTXyHNKB5u$qDGHOFI5YRuV&@y3+Lf3aKRJ{9RoWy?*8%3kALN-|KS1;F>W1}!= zR2ojB@HukY?39R6Ux7FjZM8G)3U{D#v)wA7KJBjj5gNWhX#Jj zm{*RHIkbyDXV%E7o!)r02r3Vnray3?JZaoGz7QJUoz9r$m22D6yhU`0rV!Rl%Bi9n z&VpgnaZvS5fxS{JqqQ4Y)G^^!lYZEXJ`*|A9kTC!qekaIaVDs*mdY4+5ysCN}qJan!;jmwhMrf~7(1W@XzY zn86B$`5>V5LiVv#_gO6JA&P9@W! 
zqWwfRVoNr!!iIw`Ur(IZeoYJ^m)SwPnFcGJKLkvnXblVieyhdU0009RL7qZ*mH+b> znCc$I%#kf%9JBvkTE5??9zOU3=Lh9waa2g<@02OR`E(!mx>>~O#vjGOP z$J=*Mj%$scHkCBy@wHcSL;NgT_YYNS%N&@*zMAb?CgIiG0@llnE z4wQ8rVFPe~(MhtRr78CFU3xRC-I$j9uG8ynbNMhy-ZrDG2qDF<=tPgO>dc}bb3GYX zl~Gum$U0gu@dU$kKTa7BCQH*-O#XsQ=zuPrv3un>A*GEcSvT=8$63;u*#+|@ySdck z&iJoF0xFD6pToa&MRTUt(VU{&S(TbV`RMrIYj&{`JPHZETSHFQamO3vUvb=4_2NE} zt-FBJub>x~Fu{448AIC<=FLU`jf!o)k=Vj;htO2{ndB;4_eVV)`WnCy5`KAO{^{ChtF~ve%idnHdCVKKF-(%zDoytO-Xb(#jO~(@?G< zGcvt-MHCiI3W`F6c@p;%WXE?FQ+OISiNF_i-vny7ry_`Nrr*$Ix<)i|9m<>5gL16+vp2jx^Zd}m=THW^&~G{3hi8CE4Ds~m_?xS5 z@%zFpoR$B)o}PK#-^o;f(xlWTYKP^{#KwS?d6IavN`m6$$Dw3P8C040jyqG$HIpnP zGaAoFAw(`u!Z#f207W@f{<$%z13xtjR3K(D1g^zM*K}=Sz zkN7XAQzmXV4aT`Vybyn?tk~^>+0$05qkRy96xz>5cegC}KZSTdq(liqm=qxwhi!5X zcUH#JoZ2*(XvsfMHcQNXb$76URGg-~P09W97oWVlOx`qr<0r+q*8+UoisWxGQdwH( zfdBviTR{N=c?sf2|4~$$R5!yT5}C;Ai0mo)wVx!KJ6j10ar2YldNmWt_-Tm2WG=&h z$rA|MOF9vV5u^ff!FFOFX2P5;1oB4teO5oc2s}m?modGS>?iOfS)?{w0001+K>>n! z3F4oKLcT7fwBMSWG_vB6hDnl+Rn4*jnbOF>*tRAFO%E(i6~>80;8+!5j^Yf_%DZjF zzohWf5u4TqSJ50N3R)K>jhwI_sNs{IJ3PWqBwmGB0EZ&hA~GFZLGE2$t^2QU+v&=6 z8bQ06|15Ps?u!)rzFuFQZ8W2O=%Kk^J$2?VADsQRcW(9~$YRBcaYp8ZggG4dtFUygr+?5vEXhl` z7r(M$7;)_`o{FOB8~}7&Wulkop#^N)e~G%xNV3*T69*sU9}L;<&(0Fpp$ zzs&2tnK%~~7q4W0ef^%R=<$eoB0aU&GQrADzK!Q`?QfGv)>KL}9GShT)p~zYb;?Kz zB|2F`{sH8$!#Ktt4Hr+S^XI$cd z{kY?;dSn)b1hT8EJWfGs;bngKj@d*sN!4!a9sa4yV%ubUP=ge7Z0MQF9LdL!Vu*!> zh9(RnA0Z!u2R|jDQqBf4K(B7|;q)5MNz`gNvPXW|ju5Xk{2uIg8}foBVN9FV$j*It zAA>5F)=Lg*SHZt8hwqy_YQoP-e{CRL+TAr zL;s@?j!lhLkOV(*DC+5ZHXY6n`e|tFx;us&?UsXOD%MN>^5RAcEV&_=eW=2Ptmfyj z%Fg)q)0yJH={dQmoZQJ1?()eh-G9_l7HN&Lb0SuMPj3jMA4Xs`)5Bv#tWEivExPYV zgsl99@~CS39=-uvB(BnrCnHW3+CL!Lr-xO~Tn$iou?q{&1o!+K*u7s)e#Z^dFBv9; z5~YS{1C`hL&r~!WXVfCxTUZ>C=$Pdd*>lgwH9n_AW#e~mj{rSS)tn|NfB*mh6afJO zd4{g8LidF+PA2z)G=& z2GD-@@0X9Dsjs2_BIApjqb6gpWf^2mHgjv4sx zmR))jWlP#5WTUnfhf~DdZb!}Z)7;adz?aT`cSq@w6CO7;j>Yr&pooEtJ|zM8%Lke3 z%|Mq~;mA6AH;FuxRq>J2Ix+4~4VA*0X#voE?_=&@>zZmY_Imn*EZA7ca zK_p5gxe=Yl9i2734n zQ}|!-{n9tyB@EneznO{354xiK3u|hGK)u>0sJnGI{eW-f_LUgnMox3@+$nEgHEuVw z2|Nm`+70Yyg~To|RJC7ROfgs&jZRJ*o_nqs%LZ-n5u&kQm`@n=Hok*k4c}w+0vf}x zBj>t}V>R`ThF^HfSflAr%u)|VrdhGzK+`$88Trp^a(j;?!bZJU{!1|k#IE5-4jNvD zb%wF0ucKXk%9ei-k~|ME^p&f__1$RmbG#KWJ&SzNtE9Uas)$mZ4n|%E|4Nwv001Zf z0RnlAsSyA2kAP@u{^7SrmqN;Flaq=unt4fGc2ts`7}^1zJvB)H003wK0fKprsWboA z?NyrydswsK`A0bvu=*Ua5@scSpyh>-^4=fiJ$q34wy&JrcMvHKmf1BadCiW0{*vjS z75A_hO?(co7Z!$9mF&UkduOOrIgU>w_!o6%>%(g5_Z7nN>VGXu@B}9S00=rknr2C; zL1>vw6!CAIt=oQ}0T0G4e^*rIQat^Y_20EbgOJJbE9`nb>!e)wE3|UaAwsdN9Z}9NEAzVw`DW(S}s){|#a8mi9 zIyv64NTSr4n$st5@2ibd*>VF(2Q*ufDo?xTq!K^cEtCo&>g&vANTPWL zeG*L!9|IPeKNM+lW&k-$eDp6OP&5v)xQ9bFea@}Dl*t>ooDf>WHFnezax!duD;C7p;MEa8Hi+*!yCbG93 zc7${L%^=yr{ccgpO$I6nJ@z|-zrcZRCeUygtr|I}B!(wItLe6=)3Elxa8}GhR?G%g z)%-o&Zcij@VDkeI?%zMyVsU^o$ZIgau{xX*1l|wY`x#Dbx&q>7eps79$v*qd-Mj1E zj5<$w0KjNMxzGAwf+yS{zj7V6PBMaq-J$jnCi^&vm4l2Y9^W7PvbE8`5UfWKBx!#B z{ptkf;m*4~B*INnX42#^)OgZJm8A`vzzhgPVR?WWhS3bxz8Rg(=Xq=lsV-b z!h&~ts$dovsYk~QD*n+)zI8Ae7|m?FhUfihJW%M$4I-FD{P(gSQ^>E0^LPCoR`nkda5=&^>jYXB>&=yq&Je` zFfhz=5;^;mVdufsK^-0_*G4!%ahr_DSE6y$O(5gcLtV_d+4Y2zDtH7;`y6!e$n7d=>;4L4axF-)8B@ynD;+ll4e?2?wB(*ii+=l>w_3HbVZTl>> zh=)#aB@iv2f_efVppnyMi*`#Z-UbHDB>3}qhu3Pe3ef6LWs$bVaas5x5Epq&TH;1z zg}Q_9V#I`WZicIb62;fUtPt*moCSoYlypIWOk7DH$ig7X9~m$pY&80Iy8eyH>B2`R zsM^`N_KQ875~eNF-0)I#B!y6`GzVY?^jRN9gfz`E1#hY305!76T;V86WOq6>($8Nl z7f3mHW8*7mFPXHV0z#21)mI|>T!XT+r2Hi>NLC8HgF6Ry;b-r|^$)J-_QLr*of$iH zORrA{%#;EXzv);li>G$p*nNW83D|A>s#YFxn-^M@^>b>l!FWM2z`+)?o-$SUi|zr< zFY7W&$+>#94yOiD478(@wed2>2a%pBT%Op~NW`0i2V 
zG~z+@yYK|zyw08U-6msecI}1O>4VLRw4&U^eppWZ&Q9UM%ibqYpyUIw-Kmgjcc2o0 zG&=d|Ka4GV9}q=VLjt=53PP`Gt9Nv5V}9t*+SrNIMjJysP8pylm|WC{a=E?tJ0Qv^ zt4>9sY^Dci1ZyVK7bN}Is%0Wfq1ZcTNqTgs(J^A*``%DWX-fJKck3)L_H|9*H>N}w zra~|^f^|t?0itLHUvJD^OB0(FpQ-G}e;t;)->RhIt$MsoFh(*Gr9+CONfT@2Mh+wQ;6)fGLO(G6MW9WIoW5@q@D0S!h8LSUgM;-U z{jRHaySfA1&Gyb=7zJ$LwjIdY%zJu5Y$P`e-?%$l&0B2N2Q~iy1=q6GeSAyTyILE^=A8AV%oX#80F$E z(SwGsknD$@hW&|}xU#3v>)lO`EKk$;u}#gG%wxD4m&GE59e0(NDgA4}#=uL%>N6~R zJqS?H?+;Ds-Ao_WZ1Et|&BiQb%>V!hj6nhkrevCg7KxO>Pb>cdx^J*2#?0gE*HNz- zxBU(iJR@j9JtYF_w_}|h`t(QmS7EhIadejG@6PaT8S&{-35jdX3L8sBrb`qDWll7q zT4@{QI=?;E)mc>v?h(vXt|>N;i>zNg9s~yznRR#IReJRItH%Tu!Tg#m0vn-dI12cl z>;*2wPT>ootwF;b-66*z>8&f~;MJIFd}Z?oG#^#OfDVn$bhynS$cS<)&>Q|TXPe7X z8k1;8zg=F0YEcJsds7i8#?^0Qci-CeOE_NFhYp>Q_;227vXSEGYP?9hAtH+0Fm!j# zAr5|NNk?VOUX^RmhGeTJBq=~uw~nM$(l6N4uk(Hlj|f}t|5ItSPqQoAESIoPE*PJ0 zrPn7+FJ2TSdS4E=Yta-Q#Ab^?bhlio5kxV?+oH^~1(W zxKO!{c5_Muv5do1Hg*-LHy3xVa@A7Rfm2Rs&H`V^@T$SP^B=iC|BSrl#D1O?tZ6*6cHZsExtwET?2)Qwgi)t-3r{2bcOSRwm0*In zs+Sa}YvFP>p)8qZu9N*~)XMghTK^tyZU3b3rEzrZs5CoDk<9mc$AJ7UFabM7KDo!+ zh;aGUwgfF(3TtNJa1pWN=v~}VX2@dkP8x7)-@-GpFCOh&5jw7H>|_`hq1EsN(|G=Z z+I$Y44|#+p%U2OZP4OZzy@s~=c3(axrdM_v|6~ zNKY$l$E~(CtRfxvd=Wpllw!q*6N=L3rZrP=szX>H6QYcBN2_ohgii0*;7oVV(bbz{ zE9-^KGDauf{`N>De2xV~6RG}6T864}hCmC*(o|x=dM*sH9HaJmA_s}CLL4a%&Rq2I zz6&;jl0XIqyuPC zWpm}s0{C@tN2+sA#t7%%BojNEPNyBy(El(iT!Gy-!v}Uuw9uidXs{TiwI4x7i)ndv z&%&@+`H&UM^W_$q>eBK=UsVOC3(Q?efsa46LLMLdZBNxpqI02+`*Z* zF!Ux~JMGury;Pm$MNXY^9O;}88j0GwJ#~kuC{hK-_+=#dnK|KA3bSfsf-wn#9<~@t8MuAtTtE)nCq}6>&Js&z`?49Fn(g&~O3{1jo z0>3vFx2;~P!Tzc3UP+EOXc-KeK71G|8R1*Dfwj$5ZrLW^9-NcoK8oJ>vxFZ5C{H4+ zL*}(p*~^Sc2R-91BdbsO#oj*tS`63yAO5;EsxhtL-cx@Ih(=}O6n=@+_crAs+?>7I z6d+gVHbcA@rC3zT}V$rBB)8aCPfLp#Q zaTO}ERD&I1mz$o>HoP9=ST{M-ufX#ww=qd3?OiS1ZOXpZBY}J*-$8|FV@NOm19y{Y zhJeFZJZ+LIWTfGgU7z%)6qS?OCcycpGcyqKm{HvoF=&ZptIybt=f5{E-NW)|QD0yxaHT)D}(kSWx;;Iu7T}+UYT6ZsIHX z75|qcyu+r6K~n9%!Wk+|HOen)(f)?5Q~gKaGj2KxZ!~{iw-ljc^(eS)RNe#qsasB- z5b#b=W>E??OO}+cdNdVYF-pV-h9^eNAnI}JMi^D_iZDbdz*q4Zd&#IRgW7*|3d+7f zc&C!04Ind=D3A;tKGvFslJ$d*!L|X__XGlhfdv4T12>qpwTE-$oyOkJls5>gLhCIdLdIs$O@RDJOfmHX$oi@>-+A74mf7mEw^{e87rp zgwf>m8*+J;d|lPSB+VN6mW%4Zm){2&B)AK^tso84@2#QG0Y*|`&wBW0q#K+q5rtWG zO8zR#4PjfRxhZ2w!*%DCB#j&NW$l@78WF}weSN&rp(OE&ovy(Gk_m~qAOXbSOB1Vj z9*a~jBSR#{z@=RSW?NSgxp-_k=o_LhBtL`Hz!YUxAyku#`&(d7hKCP-Q&!2xgRgdK zCcoGnn6d4AzI`5yuS$r4YIrF6VTk2c1UyRny09`3npCn_HIH0BdqF+y^G+c%1xuVPzF|_X4~56t2wC2uHPD3a@Y#j` zn@$UAU_FP##NExrE$yY=E`>rC%EMt4_y&=guexTHa;*6?;-V|DpxjLa#%?3j!I)^R z6oM6nUTYmj#{My&WHK$#_z+V_`>{-nK%S!ifU744YZt=7|8uuZX&U|i9|8OcpL?S| zb!2*fNYWj1fbzzaW>Ofx8V#O#g@RdAsG+J0yz(JR0*Yn4ylh1X+K|jd1DU|~j zFo9~;3*IkHb<)1`)l(i0RR+9!<*(;jfQbY7YWE=lvNqUqZV$lMQpK0yN^KT!kX3YJ z7189eqgaL5Gy8>q>FEB+fMWm@Im$@~IUxC^K0OfImb9e%4DU!jdd*aOoSKzn^oy$vDR^`G2 zON_ES2@83?Mpt;N40*(8W!9@1S_xK|lFz#IcN{)UV@I*03AefPcH(A#iE2C zLKIN!tW8K9%TJa+6woy|-FBIniIrLutv?2;VxM&>;j2Ol{o#s|&(R1c!qw%R&&|j1 zTN6LiN(jDhiMU(w4}tgTWyypEKa&Ke0(zsVi{R%DI>QSgO;X?E8^~F~=DR>xqtv`z z^Qa7?%vvV}P-JsaF`Vg1@Qdc7bI4lvMJ5nXYI_i10NEllvm6^OVm!c&K?qv0TSoWo^gB&E#TaR%5~YO?ylhkKK=FN&&O*fI3)! zEW%-v-Wphfc+v&J7Fk0qTseaUs(7+TZSooXMQe^7flqKSxr$J|ZOtBRo9p2CEdl~B zb8dxZ&`$Sj-sS_vs#-zu1JFR8L)}3XSgsJ#@%syibd?oohxsmT(WHGP^4Ajv z6WiDWUGBeS;wdK+i!-+1{YSJP8)w7?D>5q(@|MdZX9s_T?v5sj6 zldep~Az?7C8q*e+HEzf4PZy!N9(KktLN#=~2!R{( z{Fmy7gJl4=^S>mVhBJvHSn(1Mx>*j44Z`^8eA7=x>|=Xs59APt*}*6 z6&V>>$%O3v=v6plb8wHoro=@~6Kr3hG-fYhgmlz_m@lI^?4vT~Z3QKJtCTg5G7*TV z@)BFJtZX}rwprnVj8cQoNG0l?M7$edCE}P!(Q4&FM07JdLz<`*LQ%Jy*do0-h{N0$ z(K(0o*0S!*QM>VX$$f2!a0@2QrvxmIs{y?5vhg7w0`%fKp4)lG&tB{%nE=qYPcUe? 
zIhniolYY7K7m3+9Cx;x+1R5hl>Xx`Q1TnASfFgWZxDK2!;vHGS1^F$JM6#MmfY>vh zl>A5gC9@|C4-?0+{z3Q@gc8WeXwA@%`@C_i_&}z4rR7dDmA#0&{5f*(=+vl8q4D}N z;&V+O$ByjapGMd-k||+kCf+VMt+7L;6#2+D+*AF`Jia7ygd$k%LNp`L{iNjiZBD~i z@?w^sGwWoQgvS!0>8!@-Rx-U}skI?*1q@NDd&z<1;O=ohmbyyLtPJC0vnOQ$YcoKC zl}9Rfbul^=?uda;*NEVXcdu{-*76wDw2;2%P$A_lxrLUuoK5c2ER*Y3s!f`E1&}Z# zhKFk^nl~ZBYxhK|z&5n2>)ci-r1zZI{16tvSqZo!a#ej$cWq1X;9QQGm_-}Fuh>ls z#k=)Df_v9LszRZd(EgZSUh~4KH5DJV0ru_D1U3;L<0DVZw8(ZEE=_g~omH5$h4C!P zN<%t0zX^G#6(>ITEkFd-uQ`MwK}F2sEyw|f^XvEdF;2GdR$PH7@~)y1su8k`AaTNo zT1y5HFS5TyvkiUtET4l$BtUWj6>GghfpJXFhWB|^-ITU|CaCYV>_;5!1Z6Hu9HsC* zDjL~0VE_ODumPThYH!Tzh?CdzAEcTm@pStp#i38A(ma*2QApC1>o8JSdv^rhM5O(6 zD%=Y`)=SaApzol?c)fffx?DG%TbEqeW>85#3Xju}ZI8U3E8h8))85otmCW50PS3I! z2$yi--cyluGsYw`Dr3hTx9B9`68xqJjW}j>OSIumfYR5lo+HuJj5%bg!bUz2Gq3#WndJCfO5hD&MglU80T8< zXD#-mk|SzezdzLS!Wm0T8%5nsSIV$!^m#4E#;b1Ew^gld>RaRQ!_6FXc`hEIp^-X? z0a%O#p&fu%-S@1WzHzMXZJY?$QPoLWS|!K;Y(1pctsGXVw??ZIm|zB=&=p~3NP4>3 zV-P_FdF*|TnEckHfm>tb6rgpj(rZ6(V|$>^*nt_)i(A}kbUVF9p+Em*2fU!;@5Ix7 z6S(;&n${2(26i-Hch%-f(hK78O=$M#Jpcd!MF9c{=vJgd|MI*lD)d4K8hR<$26 z@`--<*unwMZh1x6S3{gY9?-9U57l{1RGs_=DlCxZpJ+=vs^IZsFkqv;a6;yc0Xf(} zK!dA${y3ix7MWt}c(QQ1VBzToJrqBtu_UHnc7bU%`ib>%LQe`kA-*q+#5zG4_WE2d4R!8oaxwWmx(&NIVya}8$;_EL#5Lo8< zF%~%}%K#`cz;|FOY{fzpI`3P@(cI+ga$^dBjD2zMTFI%4XzH6(n|>--m~<0e=_yWs zQa_*k%@DPvY7M}#PNULSLGp3r;+E3aE0ih+P{+meHDL%3cAt@MiOi0{stjzxoaxdiNng)k^6Ri;7Udm!}P#XOj7=9O*)(%zWu# zrMoV7b!T+x?1$Sc!rT~8;sr~Cj$5)o^WJ;d0BVCQgr9%af)$KZ=50lR2B5IW?sBX|6?D%$H2*HIVb{ z;pD|{C~hw#7fy&Dw8xCns^YOX2|Q^HEuHXmqaQa)?{jffR|HX_t3XA!5&dHU4kY?m zvOVC$GusJ{9Wq!XgXi0_@Rw{zuoBf7hZR5_hw zMruiq;ZBIgw7JS!@Pq5&Lve3=R>ro>rN-)A@^w478nqxAGIkYMRASuW@d|AmPpz=8 zyVVWbojfi8;PQ2uf&N##m8eMaX|gJ<6>VVoWN@@)56+wd?W8(n*!t|d{O$UmpRz#x zrGv9K^C!B>E7zC*fyrS&WzYG4@eFsoJ=WTLsRnJXW-*j%eZV+&n{k%^-1{-^>RUHs<_L$@wR7w|@)F!h~BZ zZ|X4TRI*VX_U}4kiM_u6|3sA9--_9&7M+s@Ls{vlkbS`*l-MIKj9`~c$mTybNFGU- ztAnx=oc?dA4V12EQwE_6dks0u&}Y`^9S4>1Z?iD}u3PS0factLfF0yhG(0ym=9}6~ z>lxp(lZtH*;hw<;G7&NGdv`u(WJDFe5_w}^Y7g8)bmd?u1>DbR>=&bxWOmJ%l1Eu_ z3UFlKTwt1*8{XQPw&_$&V@R)`rjb!pXM=Tcp2G-m(Ua}5_;vKmdD3v5Gb+rb)oT*GXm z)I%im8ClOpOla94AAOjl{$PsYI0>ZBY{Nchxi-Z~>|2H|cJH(e&*B*VT4jKPMYNQs z%e4kgC5kUhT1jts^e|`V28i+`f2p(Hx`l25C0qqty?8m-4&S((1j}n1Ks0;U|JXqAEw|vTPj`=Zn@N~ zsvQs!kHzAVOZyS6)Vo#^Z^j*VRLIgBVN$zAp0TZpTX5;0+lsv1sm$S1v8)3ldk<5_ zD!jSZ3-4sI1Ms$RByC4yK(!}Ecle}AuPd|8aud5BDTU69f>E~{it;-)qIy6MOOl~c zgQiXN-JdKgXjm;6S zNdud(_=R5%{_ds39)3(< z$~6<;I0swdhvvbxiF4WkbOS1lOn*CKVDTq&v!+kCuR;q(LIK(|pUB8Yk3m1inT&8p zksIyxJ&MnBn+9WMFLvK@QFQYBD_$7JCQWkndO|2?CAL8T00_!K0tu+3o)D!bQvz52 z0^IUm#9dnQiSJywak0VU;=t3ZGvkelPJn3Ob!*0?JcU^f@tOkM4NToWVX~o-Zxh-V z!{Wu>V(Zt{2Iyu849lxQjQB6+^->icbbzrTk;H8KTiv)bYvz0xm{-g!-yhAY<5nOM zjqEUw)oBQtT50TUaVXI~nd*1Y#;Px)8%$_+^C^Xa?hc;D2EZ#*8& zG|`YZ9*VolMI>IFzW10hyw?cNKHc zQa^&s4$v|0j)erJwtA+v)|*kK_n-5%lIAD4e~w^C6>y3Yo3}68&HAOhBk+Bwcd=MJ zJ?LW8Od8dm_<5P$HZi5Dt7D@}N6+p=?^^*B)R0h5Jza1;e%X!;$OR$M-j`}paK>Fq zE(#}2y5pK0=dF7#Y;aXEUhcIF6D>}b3`G&6EQ1!E_V~@FaAq_@qE^sJuUNg`EtYLc z0%ksdCM%aSA+_%VTrDtzubgNo_6L|2=tCf^knR8=cLYQSvZQZOS+~~w1gyX9I;p4z zjvSI2$tai)k$Eucm;D9Dl1vA}ABiVdyb^wBVWw==Cx9nxNmJ!d0)g4unrdp#p!c%y zUf%;=llM}dtNv!1oE+7P&TX}aOW<9M`gpO7{JUmW4jW0XxMWrSR zU%j1o%miV)n5r$w%TN>l27SCdE?@%H&s-Mfc3sSodqCib4Br@io>D4YC}&OrJL~?73O1h| zDWsU@+PDMb%)F4a?GmG`X<#(c6FUdM6VI|@$|S1!W&c|no|(f4{*nDyhIZ3z;1V2p z_9W1ELn#he@W_H+tGG#kt zxDBnN3RaVu_V!XS^~H z9hzobeHcrgFfFy1RBS&WqumyhZFa+jf=J}CU?xawDM}*Ac7i4m3v^Ny;ErLwWj`s$ zA5xg*y3EC;tae!<`DO9qKc!kYdC{d()HGS1`hYWVTYjUo0}v1xwp)!-lhjcsWg%d! 
zs@pHN(wuX^Y3aD%3us%{5ib4mJ@Y;|;b@kei+TpAap4`ysBB{FV|7hvOYZ%zO45~o za&Kf&`--bQHj-zxI8fY^^li^%=rP28XCO4lS`Io2Ed|?apXx@tb_o|3Gsh^;l8(n* zN|o*=lzFxn~t7sXU;{uQsPn9|GwE4B;gRSs1<<@jW_qthqB z`GIlv^RiRMFS0=jB8>RxaMR70rj!j^o|CH~I35Jp_%~Y5w!_#|EbiiPA2zt_Uv5TBHs+ zVryu1Y0JcHgFF-s0CJf?X)^<;Ay3OKXBplPib{y4J<~Vj=uw$-f+>at&}U}hy0km0 zQA{E&LA7=9Qfl=r@|u470hS**^ac3scXrIaT( z3HF2sQ8Tw3x5r()aO8TtsLLu3)YxEt4m;fQxhJCpCbmSs>?#cU>D11uj%Tz$6-^AB z@@{`P^a&qg)vQhS{g<~i9@K*0DOlfDf`A7t;_{UuMAjJO)b|uojzM_a7+OF)2yU~9 z_Rx&G@WcjLDASV|{jXPf)^E3#sq~Q)3t5LvZyG^J{RYnn&)kQlZH#8-jz4PvBOS0r{`4#2(ypt>0$C(A?bf~$@Se*#a0_V{--7>~EgB78B z4=R0AtA2Zmr;(}0%E?()L+s4SIPQJ$nY2%9Yu*$xZc z97VQsapJRBmZ5tST%z?u z67+#P+}DMRf6);G_-pKv)M+ZpAi+kXKqy4)@#hMeFxjL7xHW}lcz!ZjJoN#TV$PIe zYkiXh@eQGum2aoW&;s7LNkg~vC?_mEHjhe#6#~LPz(7uFA8 zg5Xb}qns}!G}fFz9e=8&1CZnK%RKQXL7WVLgq~`O6x?7My-izYF29%rxUJ^}l>VHO z>2(-~?>W@^F5Hqnk9`uXBm;rw)WgQ)*9?b9Fz7h!aZT{a*}S(HS*KQqOC6kP`)&n3 zIVfxcAu6&S16b*Ze+pCkF9@vLKC=c-2-&2QZ^Vg^03M-CD;<7(I&?7D z60+3arJW;}*#yfS%|=~=*Wb1yQ!f`rEPS3SSt}`}O*13tjw@$QlZ!!(H1Y4uTjCE} zk1b94v;tEY==ZuW8D%~;!YV0r1PoTS1u&KAk@DJ3vfy<)wj~x4)tyXft6SHUFv67N zF0AO2!bGBuF8|z2iPC9fGI+hDDth{|JO3z9LYJ}qLD!c3V8&81D2Ek~`>}_cT#QKO zSa;F};C4sX?0I@hwe|e*k3tjfN5$GVUrv>0n=R+3`?tsnOoRYp zG;UiA2qX&!_PGgNfctx_;zHPca)gW?_s-zsm9l6~xXoMnNc~?P7_czvc4?#6g==UZ zR*`>VbTF5R>ar_-!ZDKPIa6G5;_}v`fTl_C_1k5V*1aj_dTd6T2ppQscBeu*vTng8 zog|WW)H(itZDM)Tjw2sI-BtnTPLPaK$H082{TkB40Z8u z1vcAlp}y*>gfY+Ic>6bX0qsReb&>t&cOYx~6o*Hq^bPXvm=8hzXA?*yRUmba&A408 zqSf~4I)c^u$IPowmnx0Nf^M>h@=`*Mm1*(yQbXJUMBgnwVjglO{i;v4!_kjEb8E{~ z9z2$jqXS)Zj$Y6}u||Y%ek7hL^C7&BFhy_8f48^};|T5?v3#O9gP)8(a7}Ub8w#cj zVJD`PfHr!c?_HS#&y!GSh4xVk+&Da!)y;b_&fJrO4z;u&<1+M7Pfe|d5?|;$f=tog zX?@BegY|6mF8HO`i-yl8!1xf}ecKY5IW<@Tr9O|y;EaEyes82n58r(Gi*~;#j^F^y zsPcMA2fjW1>VYNE6IFy~*!xAH;8J}VpYn#hpOe-2x9;OjHgR1XvKe~U_hwU^UDM6x z^9Z2=qHv(e5$!L)X6}lx6h&#f@KhCkn416q1PMU_f@&!zgeggsz=?lPB<@jBvTA0so-%I^Or0-Rtt0EG;j=!gI!4bS$$tH)04 zHk+p2HqaG&# zpBb>o)!35nRh zhhhx&L@R!$$!9XDseX_ks9U^=&I%SlG7Mq{m-^b2@V%=vMP`VVXxB!DagLbUu2b$; z^3!-hICqcYu-2?S>DH7ojpa@pT}7PePK-X<`)_31lIzwHrJ`^`);P!#JUv@j;SlP4o6fDCIj2s1sq zhk;FS%SXD{(SNH#JHc(sJ!%cZrwd6AVlUgUP{JjL#o*G{WDg^wlm^6id{EsFfI=i4 zc6dL*Cw;)NKi)ijcxnjSIc2KE5Wr&OW}FhXLYJ62=PJIHgStP3_iBoz-SX41nGiyB zj*^i27^)xo*s|_pOUsikGa&gZDYB2?h-CPc$Yqr;Zs2_5N0rp}`-2pkAJp~4&!7$b&_09Ey2dqpEy zO|A(}3P_s89!;H&;}$&nt|}lXQlfjm;99(jiZ{|@O)u;|ea>2rkCm9{1Mg+ri%zzC z$uOLMshw^;xVHI|Dor6Upnqcdnsk}dCngCY{+H@f2Fh!ZXMVhV$HPxVtL&nIn}wvl z?s(I&&YwVK;#Y6StJQ@I&);8SP#AEAH#Q6r^`Dh?0|Ps4l=Q4t=3#FRU8kKI%=C?0E@{e1i}Tiy@GCjQwc=|9LWX))6#_A zgu(El)+;*i0!8%&H99xE(6Lu-rtpRuboBu{(`#QB1_pyw3!!i1xN<`E;xo~T7MOs~3OoBVyCzW2>#7Bs2t5Lf>q z9-388ClF&CeChV$AhZfUXA&wzi^y`1vYJV!BqFAK!)wZ!FlpMb`Uq(16ub=p&_FN0 zBVw{e^*EumPlJ@st^C zjvCFZm>@DxS*1l47vg8U;-u`QB5Ke!g)7Y)Fw$9?F}3X?`n@5eRqt)qVQui7dxk(y zph#S~h@H#fG@4c+7P78e#7+u?R56IK|JO(YJEq(5rl)Go?`u!fmpDq02GwzMeY8~s z$*}No6;?i;pBLIG5L|>eNL=yG?QtrGOQlNZ{YN)eJnNBbWsw0ipkafXCAs>WcWk_- zoyX$=LSAcwP2Izuy>l-k17s$hg1qwy3fE)7< z+et@KM$?XtXoMY^_R&s`N&8Kiu87y{28QXX4`kyJps}YzDOn=#7usfJF3;z`;T4IS zUPUC%1>$&Oc#}!)Cy==9SHh_Psv`6|_kfShWs zGM9s|b7fKEpEZ?Ifa@`;B3!AXFw{@L5RP5hWD55)-Sw~tg=lnyGnRJP>C5o~7Ix%a z;sI(uU_QAVVl8Yq`N6Iu;-pHSmK~j|UsgvIlIU!dVuB4P!PJO3{92Xhx)#$ad;y^2 z0i1F+z%Ofhb6l?|>E_`VWyc^b&bA%Mhvjx=msq2aTum%+ZBA+%r5!xfyxjt{%|w9+ z;Y)%J#;(NfOY(4>C?oKrL;x#$YjV4NZgc8hQGfsd13*Cn3F?GXMAQHKsza-8CyDUG z?ihDVY{z0xjhXB~Sy&Wt4o*VOwvVfxm0Rw~VYyW-?qpXD6 zc?;tv7A?@DPMMyGKsb(;)FORjW@N2l+K17yg-Q*aq^1LTUq$vHgNp=B8}0riMGx4Q z4O21#+f1#}!-E$|AAtGKQySKK`rrR6xz|kDSZrrx&xe>!$+c}GLKup`B=c1;I36Sn 
[remaining base85 GIT binary patch data for the new test asset tests/multimodal/assets/corrupted.mp4 omitted]

diff --git a/tests/multimodal/test_video.py b/tests/multimodal/test_video.py
index 6572616769a9..6ed21de368ac 100644
--- a/tests/multimodal/test_video.py
+++ b/tests/multimodal/test_video.py
@@ -18,6 +18,7 @@
 
 pytestmark = pytest.mark.cpu_test
 
+ASSETS_DIR = Path(__file__).parent / "assets"
 NUM_FRAMES = 10
 FAKE_OUTPUT_1 = np.random.rand(NUM_FRAMES, 1280, 720, 3)
 FAKE_OUTPUT_2 = np.random.rand(NUM_FRAMES, 1280, 720, 3)
@@ -140,3 +141,39 @@ def test_opencv_video_io_colorspace(is_color: bool, fourcc: str, ext: str):
     )
     assert np.sum(np.isnan(sim)) / sim.size < 0.001
     assert np.nanmean(sim) > 0.99
+
+
+def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch):
+    """
+    Regression test for handling videos with broken frames.
+    This test uses a pre-corrupted video file (assets/corrupted.mp4) that
+    contains broken/unreadable frames to verify the video loader handles
+    them gracefully without crashing and returns accurate metadata.
+ """ + with monkeypatch.context() as m: + m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv") + + # Load the pre-corrupted video file that contains broken frames + corrupted_video_path = ASSETS_DIR / "corrupted.mp4" + + with open(corrupted_video_path, "rb") as f: + video_data = f.read() + + loader = VIDEO_LOADER_REGISTRY.load("opencv") + frames, metadata = loader.load_bytes(video_data, num_frames=-1) + + # Verify metadata consistency: + # frames_indices must match actual loaded frames + assert frames.shape[0] == len(metadata["frames_indices"]), ( + f"Frames array size must equal frames_indices length. " + f"Got {frames.shape[0]} frames but " + f"{len(metadata['frames_indices'])} indices" + ) + + # Verify that broken frames were skipped: + # loaded frames should be less than total + assert frames.shape[0] < metadata["total_num_frames"], ( + f"Should load fewer frames than total due to broken frames. " + f"Expected fewer than {metadata['total_num_frames']} frames, " + f"but loaded {frames.shape[0]} frames" + ) diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 369c5e6cb4d1..5c75bee54dd3 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -63,6 +63,63 @@ def load_bytes( ) -> tuple[npt.NDArray, dict[str, Any]]: raise NotImplementedError + @staticmethod + def _read_frames( + cap, + frame_indices: set[int], + num_expected_frames: int, + max_frame_idx: int, + ) -> tuple[npt.NDArray, int, list[int]]: + import cv2 + + width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + frames = np.empty((num_expected_frames, height, width, 3), dtype=np.uint8) + + i = 0 + valid_frame_indices = [] + for idx in range(max_frame_idx + 1): + ok = cap.grab() + if not ok: + # Frame is broken/unreadable, log warning + if idx in frame_indices: + logger.warning( + "Failed to grab frame %d during video loading. " + "This frame will be skipped.", + idx, + ) + continue + if idx in frame_indices: + ret, frame = cap.retrieve() + if ret: + frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + valid_frame_indices.append(idx) + i += 1 + else: + # retrieve() failed even though grab() succeeded + logger.warning( + "Failed to retrieve frame %d during video loading. " + "This frame will be skipped.", + idx, + ) + + valid_num_frames = len(valid_frame_indices) + if valid_num_frames < num_expected_frames: + logger.warning( + "Video loading completed with %d broken/unreadable frames. " + "Expected %d frames but only loaded %d frames.", + num_expected_frames - valid_num_frames, + num_expected_frames, + valid_num_frames, + ) + + assert i == valid_num_frames, ( + f"Expected reading {valid_num_frames} frames, " + f"but only loaded {i} frames from video." + ) + + return frames[:valid_num_frames], valid_num_frames, valid_frame_indices + VIDEO_LOADER_REGISTRY = ExtensionManager() @@ -120,24 +177,10 @@ def load_bytes( ) frame_idx = uniform_sampled_frames.tolist() - width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) - height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) - frames = np.empty((len(frame_idx), height, width, 3), dtype=np.uint8) - - i = 0 - for idx in range(max(frame_idx) + 1): - ok = cap.grab() - if not ok: - break - if idx in frame_idx: - ret, frame = cap.retrieve() - if ret: - frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) - i += 1 - - assert i == num_frames_to_sample, ( - f"Expected reading {num_frames_to_sample} frames, " - f"but only loaded {i} frames from video." 
+ # Convert to set for O(1) lookup performance + frame_idx_set = set(frame_idx) + frames, valid_num_frames, valid_frame_indices = cls._read_frames( + cap, frame_idx_set, num_frames_to_sample, max(frame_idx) ) # Use transformers transformers.video_utils.VideoMetadata format @@ -148,10 +191,10 @@ def load_bytes( "fps": original_fps, "duration": duration, "video_backend": "opencv", - "frames_indices": list(frame_idx), + "frames_indices": valid_frame_indices, # extra field used to control hf processor's video # sampling behavior - "do_sample_frames": num_frames_to_sample == total_frames_num, + "do_sample_frames": valid_num_frames == total_frames_num, } return frames, metadata @@ -185,10 +228,10 @@ def load_bytes( # Refer to: # https://github.com/huggingface/transformers/blob/v4.55.4/src/transformers/models/glm4v/video_processing_glm4v.py#L103-L140 - frame_indices: range | list[int] + frame_indices_list: list[int] if duration <= max_duration: n = int(math.floor(duration * fps)) - frame_indices = sorted( + frame_indices_list = sorted( { min(max_frame_idx, int(math.ceil(i * original_fps / fps))) for i in range(n) @@ -197,34 +240,23 @@ def load_bytes( else: num_samples = int(max_duration * fps) if num_samples >= total_frames_num: - frame_indices = range(total_frames_num) + frame_indices_list = list(range(total_frames_num)) else: target_seconds = np.linspace(0, duration, num_samples, endpoint=True) - frame_indices = sorted( + frame_indices_list = sorted( { min(max_frame_idx, int(math.ceil(t * original_fps))) for t in target_seconds } ) - width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) - height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) - frames = np.empty((len(frame_indices), height, width, 3), dtype=np.uint8) - - i = 0 - for idx in range(total_frames_num): - ok = cap.grab() - if not ok: - break - if idx in frame_indices: - ret, frame = cap.retrieve() - if ret: - frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) - i += 1 - - assert i == len(frame_indices), ( - f"Expected reading {len(frame_indices)} frames, " - f"but only loaded {i} frames from video." 
+ # Convert to set for O(1) lookup performance + frame_indices_set = set(frame_indices_list) + frames, valid_num_frames, valid_frame_indices = cls._read_frames( + cap, + frame_indices_set, + len(frame_indices_list), + total_frames_num - 1, ) # Use transformers transformers.video_utils.VideoMetadata format @@ -233,7 +265,7 @@ def load_bytes( "fps": original_fps, "duration": duration, "video_backend": "opencv_dynamic", - "frames_indices": list(frame_indices), + "frames_indices": valid_frame_indices, "do_sample_frames": False, } From 64192d562402a56dc1e3a2141cfe896a7f0b52e9 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Thu, 20 Nov 2025 13:23:22 +0800 Subject: [PATCH 225/578] [Bugfix] Revert custom attention mask for gemma3-mm (#28995) Signed-off-by: Isotr0py Co-authored-by: Cyrus Leung --- vllm/config/model.py | 5 - vllm/model_executor/models/gemma3_mm.py | 138 +----------------------- vllm/transformers_utils/config.py | 11 -- vllm/v1/worker/gpu_model_runner.py | 19 ---- 4 files changed, 1 insertion(+), 172 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index d1e56a72a318..97cba6ea7295 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -32,7 +32,6 @@ try_get_generation_config, try_get_safetensors_metadata, try_get_tokenizer_config, - uses_custom_attention_masks, uses_mrope, ) from vllm.transformers_utils.gguf_utils import ( @@ -1625,10 +1624,6 @@ def uses_alibi(self) -> bool: def uses_mrope(self) -> bool: return uses_mrope(self.hf_config) - @property - def uses_custom_attention_masks(self) -> bool: - return uses_custom_attention_masks(self.hf_config) - @property def is_multimodal_model(self) -> bool: return self.multimodal_config is not None diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index fe83c8b63b01..43c69e5e1399 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -596,7 +596,7 @@ def _process_image_input( def get_language_model(self) -> torch.nn.Module: return self.language_model - def get_multimodal_embeddings(self, **kwargs: object) -> MultiModalEmbeddings: + def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: return [] @@ -644,142 +644,6 @@ def forward( return hidden_states - def generate_attention_masks( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - mask_dtype: torch.dtype, - ) -> dict[str, Any]: - """Generate custom attention masks for Gemma3 multimodal inputs. - - This is called by V1 engine's gpu_model_runner during preprocessing - to generate attention masks that allow bidirectional attention between - image tokens while maintaining causal attention for text. - """ - # NOTE(woosuk): Here, we distinguish the sequences by the position id 0. - # This is a HACK. Fix this. 
- start_indices = (positions == 0).cpu().nonzero() - num_seqs = len(start_indices) - seq_lens = [] - for i in range(num_seqs): - start_idx = start_indices[i] - end_idx = start_indices[i + 1] if i < num_seqs - 1 else len(input_ids) - seq_lens.append(end_idx - start_idx) - - global_attn_masks = [] - local_attn_masks = [] - start_idx = 0 - for seq_idx, seq_len in enumerate(seq_lens): - end_idx = start_idx + seq_len - input_token_ids = input_ids[start_idx:end_idx] - - # Find image token positions - img_pos = input_token_ids == self.config.image_token_index - - start_idx = end_idx - - # Create a global causal mask - global_attn_mask = torch.empty( - 1, - 1, - seq_len, - seq_len, - dtype=mask_dtype, - device=input_ids.device, - ) - global_attn_mask.fill_(float("-inf")) - # Fill the lower triangle with 0 (causal attention) - global_attn_mask = global_attn_mask.triu(diagonal=1) - - # Enable bidirectional attention between image tokens - img_mask = torch.zeros_like(global_attn_mask) - img_mask[:, :, :, img_pos] += 1 - img_mask[:, :, img_pos, :] += 1 - global_attn_mask = torch.where(img_mask == 2, 0, global_attn_mask) - global_attn_masks.append(global_attn_mask) - - # GGUF compatibility: config might be Gemma3TextConfig directly - text_config = getattr(self.config, "text_config", self.config) - sliding_window = text_config.sliding_window - if sliding_window is not None: - # Create a local causal mask with sliding window (1024) - local_attn_mask = torch.ones_like(global_attn_mask) - local_attn_mask = torch.tril(local_attn_mask, diagonal=-sliding_window) - local_attn_mask = torch.where( - local_attn_mask == 0, global_attn_mask, float("-inf") - ) - local_attn_masks.append(local_attn_mask) - - return { - "has_images": True, - "seq_lens": seq_lens, - "global_attn_masks": global_attn_masks, - "local_attn_masks": local_attn_masks, - } - - def prepare_attn_masks( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - mask_dtype: torch.dtype, - **kwargs, - ): - kwargs["has_images"] = True - # NOTE(woosuk): Here, we distinguish the sequences by the position id 0. - # This is a HACK. Fix this. - start_indices = (positions == 0).cpu().nonzero() - num_seqs = len(start_indices) - seq_lens = [] - for i in range(num_seqs): - start_idx = start_indices[i].item() - if i < num_seqs - 1: - end_idx = start_indices[i + 1].item() - else: - end_idx = len(input_ids) - seq_lens.append(end_idx - start_idx) - kwargs["seq_lens"] = seq_lens - - global_attn_masks = [] - local_attn_masks = [] - start_idx = 0 - for seq_len in seq_lens: - end_idx = start_idx + seq_len - input_token_ids = input_ids[start_idx:end_idx] - start_idx = end_idx - # Create a global causal mask. - global_attn_mask = torch.empty( - 1, - 1, - seq_len, - seq_len, - dtype=mask_dtype, - device=input_ids.device, - ) - global_attn_mask.fill_(float("-inf")) - # Fill the lower triangle with 0. - global_attn_mask = global_attn_mask.triu(diagonal=1) - - # Consider the bidirectional attention between image tokens. - img_mask = torch.zeros_like(global_attn_mask) - img_pos = input_token_ids == self.config.image_token_index - img_mask[:, :, :, img_pos] += 1 - img_mask[:, :, img_pos, :] += 1 - global_attn_mask = torch.where(img_mask == 2, 0, global_attn_mask) - global_attn_masks.append(global_attn_mask) - - sliding_window = self.config.text_config.sliding_window - if sliding_window is not None: - # Create a local causal mask with sliding window (1024). 
- local_attn_mask = torch.ones_like(global_attn_mask) - local_attn_mask = torch.tril(local_attn_mask, diagonal=-sliding_window) - local_attn_mask = torch.where( - local_attn_mask == 0, global_attn_mask, float("-inf") - ) - local_attn_masks.append(local_attn_mask) - kwargs["global_attn_masks"] = global_attn_masks - kwargs["local_attn_masks"] = local_attn_masks - return kwargs - def compute_logits( self, hidden_states: torch.Tensor, diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 4ca155af03dc..df24738477e7 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -520,17 +520,6 @@ def is_interleaved(config: PretrainedConfig) -> bool: return False -def uses_custom_attention_masks(config: PretrainedConfig) -> bool: - """Detect if model uses custom attention mask generation for multimodal. - - Some multimodal models require custom attention masks that enable - bidirectional attention between image tokens while maintaining causal - attention for text tokens. Currently applies to Gemma3 multimodal models. - """ - architectures = getattr(config, "architectures", []) - return "Gemma3ForConditionalGeneration" in architectures - - def _maybe_update_auto_config_kwargs(kwargs: dict[str, Any], model_type: str): """ Update kwargs for AutoConfig initialization based on model_type diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 80f8344d4410..0490ed39c8c7 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -324,7 +324,6 @@ def __init__( # Multi-modal data support self.mm_registry = MULTIMODAL_REGISTRY self.uses_mrope = model_config.uses_mrope - self.uses_custom_attention_masks = model_config.uses_custom_attention_masks self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs( model_config ) @@ -2352,24 +2351,6 @@ def _preprocess( **self._init_model_kwargs(num_scheduled_tokens), **self._extract_mm_kwargs(scheduler_output), } - - # Generate custom attention masks for models that require them. - # V1 pre-generates embeddings, so forward() skips prepare_attn_masks(). - # Check mm_features (mm_embeds is empty during decode). - has_mm_features = any( - req_state.mm_features for req_state in self.requests.values() - ) - if ( - self.uses_custom_attention_masks - and has_mm_features - and hasattr(self.model, "generate_attention_masks") - ): - mask_kwargs = self.model.generate_attention_masks( - self.input_ids.gpu[:num_scheduled_tokens], - self.positions.gpu[:num_scheduled_tokens], - mask_dtype=self.model.dtype, - ) - model_kwargs.update(mask_kwargs) elif self.enable_prompt_embeds and is_first_rank: # Get the input embeddings for the tokens that are not input embeds, # then put them into the appropriate positions. 
From a9705a290af05ad71023714074ad8bf1a50c60a3 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Thu, 20 Nov 2025 06:04:23 +0000 Subject: [PATCH 226/578] [Model][QwenVL] Replace `torch.repeat_interleave` with faster `np.repeat` (#28964) Signed-off-by: Lukas Geiger --- .../models/multimodal/generation/test_qwen2_vl.py | 14 ++------------ vllm/model_executor/models/qwen2_vl.py | 15 +++++++++------ vllm/model_executor/models/qwen3_vl.py | 12 +++++++----- 3 files changed, 18 insertions(+), 23 deletions(-) diff --git a/tests/models/multimodal/generation/test_qwen2_vl.py b/tests/models/multimodal/generation/test_qwen2_vl.py index e10b8e1e77af..e1b7dbf99f1f 100644 --- a/tests/models/multimodal/generation/test_qwen2_vl.py +++ b/tests/models/multimodal/generation/test_qwen2_vl.py @@ -128,12 +128,7 @@ def get_image_embeds(model): visual = model.visual pixel_values_on_device = pixel_values.to(visual.device, dtype=visual.dtype) - image_grid_thw_on_device = image_grid_thw.to( - visual.device, dtype=torch.int64 - ) - return visual( - pixel_values_on_device, grid_thw=image_grid_thw_on_device - ).cpu() + return visual(pixel_values_on_device, grid_thw=image_grid_thw).cpu() image_embeds = torch.concat(llm.apply_model(get_image_embeds)) @@ -217,12 +212,7 @@ def get_image_embeds(model): visual = model.visual pixel_values_on_device = pixel_values.to(visual.device, dtype=visual.dtype) - video_grid_thw_on_device = video_grid_thw.to( - visual.device, dtype=torch.int64 - ) - return visual( - pixel_values_on_device, grid_thw=video_grid_thw_on_device - ).cpu() + return visual(pixel_values_on_device, grid_thw=video_grid_thw).cpu() video_embeds = torch.concat(llm.apply_model(get_image_embeds)) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index d25ff2785bfe..479a7871e364 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -29,6 +29,7 @@ from functools import partial from typing import Annotated, Any, Literal, TypeAlias +import numpy as np import torch import torch.nn as nn import torch.nn.functional as F @@ -751,25 +752,27 @@ def forward( if isinstance(grid_thw, list): grid_thw_list = grid_thw - grid_thw = torch.tensor(grid_thw, dtype=torch.int32) + grid_thw = np.array(grid_thw, dtype=np.int32) else: grid_thw_list = grid_thw.tolist() + grid_thw = grid_thw.numpy() # compute position embedding rotary_pos_emb_cos, rotary_pos_emb_sin = self.rot_pos_emb(grid_thw_list) # compute cu_seqlens - cu_seqlens = torch.repeat_interleave( - grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0] - ).cumsum(dim=0, dtype=torch.int32) - cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens]) - cu_seqlens = cu_seqlens.to(self.device, non_blocking=True) + cu_seqlens = np.repeat(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum( + axis=0, dtype=np.int32 + ) + cu_seqlens = np.concatenate([np.zeros(1, dtype=np.int32), cu_seqlens]) + cu_seqlens = torch.from_numpy(cu_seqlens) # transformers x = x.unsqueeze(1) # pre-compute seqlens for attn mask to reduce cuMemcpy operations max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens) + cu_seqlens = cu_seqlens.to(self.device, non_blocking=True) for blk in self.blocks: x = blk( x, diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index c10aeaec5ab8..90c4894d33e8 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -553,18 +553,20 @@ def forward( if isinstance(grid_thw, list): grid_thw_list = grid_thw - 
grid_thw = torch.tensor(grid_thw, dtype=torch.int32) + grid_thw = np.array(grid_thw, dtype=np.int32) else: grid_thw_list = grid_thw.tolist() + grid_thw = grid_thw.numpy() pos_embeds = self.fast_pos_embed_interpolate(grid_thw_list) hidden_states = hidden_states + pos_embeds rotary_pos_emb_cos, rotary_pos_emb_sin = self.rot_pos_emb(grid_thw_list) - cu_seqlens = torch.repeat_interleave( - grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0] - ).cumsum(dim=0, dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32) - cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens]) + cu_seqlens = np.repeat(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum( + axis=0, dtype=np.int32 + ) + cu_seqlens = np.concatenate([np.zeros(1, dtype=np.int32), cu_seqlens]) + cu_seqlens = torch.from_numpy(cu_seqlens) hidden_states = hidden_states.unsqueeze(1) max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens) From 1c7bcc55b86d6cb867072dfb890dec6c7e747a1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= <45557362+qgallouedec@users.noreply.github.com> Date: Wed, 19 Nov 2025 23:20:12 -0700 Subject: [PATCH 227/578] [Frontend] Allow parsed tool arguments (#28820) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/entrypoints/chat_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 3b722c2d9277..03214c4d131b 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -1437,7 +1437,8 @@ def _postprocess_messages(messages: list[ConversationMessage]) -> None: for item in message["tool_calls"]: # if arguments is None or empty string, set to {} if content := item["function"].get("arguments"): - item["function"]["arguments"] = json.loads(content) + if not isinstance(content, (dict, list)): + item["function"]["arguments"] = json.loads(content) else: item["function"]["arguments"] = {} From 20e4497be23f8e74882bfb0bd0db3d30dd821afc Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 20 Nov 2025 14:39:10 +0800 Subject: [PATCH 228/578] [V0 Deprecation] Remove `num_lookahead_slots` (#29000) Signed-off-by: DarkLight1337 Co-authored-by: Michael Goin --- vllm/config/scheduler.py | 9 --------- vllm/config/speculative.py | 10 ---------- vllm/engine/arg_utils.py | 11 ----------- 3 files changed, 30 deletions(-) diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py index 8194295ffedb..b6078706daac 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -62,15 +62,6 @@ class SchedulerConfig: """For chunked prefill, a request is considered long if the prompt is longer than this number of tokens.""" - num_lookahead_slots: int = Field(default=0, ge=0) - """The number of slots to allocate per sequence per - step, beyond the known token ids. This is used in speculative - decoding to store KV activations of tokens which may or may not be - accepted. - - NOTE: This will be replaced by speculative config in the future; it is - present to enable correctness tests until then.""" - enable_chunked_prefill: bool = True """If True, prefill requests can be chunked based on the remaining `max_num_batched_tokens`. 
diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py index 13a8632413d9..a0c65b6049e1 100644 --- a/vllm/config/speculative.py +++ b/vllm/config/speculative.py @@ -634,16 +634,6 @@ def _verify_args(self) -> Self: return self - @property - def num_lookahead_slots(self) -> int: - """The number of additional slots the scheduler should allocate per - step, in addition to the slots allocated for each known token. - - This is equal to the number of speculative tokens, as each speculative - token must be scored. - """ - return self.num_speculative_tokens - def use_eagle(self) -> bool: return self.method in ("eagle", "eagle3", "mtp") diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 68205b6079d7..74828bc109cb 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -488,7 +488,6 @@ class EngineArgs: ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight num_gpu_blocks_override: int | None = CacheConfig.num_gpu_blocks_override - num_lookahead_slots: int = SchedulerConfig.num_lookahead_slots model_loader_extra_config: dict = get_field(LoadConfig, "model_loader_extra_config") ignore_patterns: str | list[str] = get_field(LoadConfig, "ignore_patterns") @@ -1081,9 +1080,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "--long-prefill-token-threshold", **scheduler_kwargs["long_prefill_token_threshold"], ) - scheduler_group.add_argument( - "--num-lookahead-slots", **scheduler_kwargs["num_lookahead_slots"] - ) # multi-step scheduling has been removed; corresponding arguments # are no longer supported. scheduler_group.add_argument( @@ -1653,18 +1649,11 @@ def create_engine_config( target_parallel_config=parallel_config, ) - # make sure num_lookahead_slots is set appropriately depending on - # whether speculative decoding is enabled - num_lookahead_slots = self.num_lookahead_slots - if speculative_config is not None: - num_lookahead_slots = speculative_config.num_lookahead_slots - scheduler_config = SchedulerConfig( runner_type=model_config.runner_type, max_num_batched_tokens=self.max_num_batched_tokens, max_num_seqs=self.max_num_seqs, max_model_len=model_config.max_model_len, - num_lookahead_slots=num_lookahead_slots, enable_chunked_prefill=self.enable_chunked_prefill, disable_chunked_mm_input=self.disable_chunked_mm_input, is_multimodal_model=model_config.is_multimodal_model, From 7218f83992c7d61fc3845ea24407a1f3b909713e Mon Sep 17 00:00:00 2001 From: Pleaplusone Date: Thu, 20 Nov 2025 15:50:23 +0800 Subject: [PATCH 229/578] [ROCm][BugFix] Fix shared expert loading error when disable `VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS` (#28633) Signed-off-by: ganyi --- vllm/model_executor/models/deepseek_v2.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index c50fc327e760..d0a116b97997 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -287,7 +287,10 @@ def __init__( ) self.is_rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled() - if config.n_shared_experts is None or self.is_rocm_aiter_moe_enabled: + self.is_fusion_moe_shared_experts_enabled = ( + rocm_aiter_ops.is_fusion_moe_shared_experts_enabled() + ) + if config.n_shared_experts is None or self.is_fusion_moe_shared_experts_enabled: self.shared_experts = None else: intermediate_size = config.moe_intermediate_size * config.n_shared_experts @@ -327,7 +330,7 @@ def __init__( 
num_redundant_experts=self.n_redundant_experts, is_sequence_parallel=self.is_sequence_parallel, n_shared_experts=config.n_shared_experts - if rocm_aiter_ops.is_fusion_moe_shared_experts_enabled() + if self.is_fusion_moe_shared_experts_enabled else None, ) From 1e1c06789e63a760d91aaf6e4ddfeabfe382c301 Mon Sep 17 00:00:00 2001 From: Bradley D Date: Wed, 19 Nov 2025 23:53:38 -0800 Subject: [PATCH 230/578] [ci][amd] fix EPLB execution test (#28742) Signed-off-by: Bradley Davis --- tests/distributed/test_eplb_execute.py | 423 +++++++++++++------------ 1 file changed, 213 insertions(+), 210 deletions(-) diff --git a/tests/distributed/test_eplb_execute.py b/tests/distributed/test_eplb_execute.py index 7b45ae82c72d..0a97749ac318 100644 --- a/tests/distributed/test_eplb_execute.py +++ b/tests/distributed/test_eplb_execute.py @@ -1,13 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import multiprocessing import os import random import pytest import torch import torch.distributed +import torch.multiprocessing as mp from vllm.distributed.eplb.rebalance_execute import rearrange_expert_weights_inplace from vllm.distributed.parallel_state import ( @@ -17,10 +17,12 @@ ) from vllm.utils.system_utils import update_environment_variables +mp.set_start_method("spawn", force=True) -def distributed_run(fn, world_size): + +def distributed_run(fn, world_size, *args): number_of_processes = world_size - processes: list[multiprocessing.Process] = [] + processes: list[mp.Process] = [] for i in range(number_of_processes): env: dict[str, str] = {} env["RANK"] = str(i) @@ -29,7 +31,7 @@ def distributed_run(fn, world_size): env["LOCAL_WORLD_SIZE"] = str(number_of_processes) env["MASTER_ADDR"] = "localhost" env["MASTER_PORT"] = "12345" - p = multiprocessing.Process(target=fn, args=(env,)) + p = mp.Process(target=fn, args=(env, world_size, *args)) processes.append(p) p.start() @@ -40,24 +42,16 @@ def distributed_run(fn, world_size): assert p.exitcode == 0 -def worker_fn_wrapper(fn): - # `multiprocessing.Process` cannot accept environment variables directly - # so we need to pass the environment variables as arguments - # and update the environment variables in the function - def wrapped_fn(env): - update_environment_variables(env) - local_rank = os.environ["LOCAL_RANK"] - device = torch.device(f"cuda:{local_rank}") - torch.cuda.set_device(device) - init_distributed_environment() - - # Ensure each worker process has the same random seed - random.seed(42) - torch.manual_seed(42) - - fn() +def set_env_vars_and_device(env: dict[str, str]) -> None: + update_environment_variables(env) + local_rank = os.environ["LOCAL_RANK"] + device = torch.device(f"cuda:{local_rank}") + torch.cuda.set_device(device) + init_distributed_environment() - return wrapped_fn + # Ensure each worker process has the same random seed + random.seed(42) + torch.manual_seed(42) def create_expert_indices_with_redundancy( @@ -275,6 +269,79 @@ def verify_redundant_experts_have_same_weights( ) +def _test_rearrange_expert_weights_with_redundancy( + env, world_size, num_layers, num_local_experts, num_logical_experts +) -> None: + # Initialize model parallel (using tensor parallel as an entrypoint + # to expert parallel) + set_env_vars_and_device(env) + ensure_model_parallel_initialized( + tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1 + ) + + ep_group = get_tp_group().cpu_group + ep_rank = torch.distributed.get_rank() + device = torch.device(f"cuda:{ep_rank}") + + # Test 
parameters + total_physical_experts = world_size * num_local_experts + hidden_sizes = [32, 64] # Two different weight matrices + + # Create old expert indices (with redundancy) + redundancy_config = create_redundancy_config( + num_logical_experts, total_physical_experts + ) + + old_indices = create_expert_indices_with_redundancy( + num_layers, + num_logical_experts, + total_physical_experts, + redundancy_config, + ) + + # Create new expert indices (with redundancy) + new_redundancy_config = create_redundancy_config( + num_logical_experts, total_physical_experts + ) + new_indices = create_expert_indices_with_redundancy( + num_layers, + num_logical_experts, + total_physical_experts, + new_redundancy_config, + ) + + # Create expert weights + expert_weights = create_expert_weights( + num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices + ) + + # Execute weight rearrangement + rearrange_expert_weights_inplace( + old_indices, + new_indices, + expert_weights, + ep_group, + is_profile=False, + ) + + # Verify the rearrangement result + verify_expert_weights_after_shuffle( + expert_weights, + new_indices, + hidden_sizes, + ep_rank, + num_local_experts, + ) + + verify_redundant_experts_have_same_weights( + expert_weights, + new_indices, + hidden_sizes, + world_size, + num_local_experts, + ) + + @pytest.mark.parametrize( "world_size,num_layers,num_local_experts,num_logical_experts", [ @@ -305,78 +372,69 @@ def test_rearrange_expert_weights_with_redundancy( if torch.cuda.device_count() < world_size: pytest.skip(f"Need at least {world_size} GPUs to run the test") - - @worker_fn_wrapper - def worker_fn(): - # Initialize model parallel (using tensor parallel as an entrypoint - # to expert parallel) - ensure_model_parallel_initialized( - tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1 - ) - - ep_group = get_tp_group().cpu_group - ep_rank = torch.distributed.get_rank() - device = torch.device(f"cuda:{ep_rank}") - - # Test parameters - total_physical_experts = world_size * num_local_experts - hidden_sizes = [32, 64] # Two different weight matrices - - # Create old expert indices (with redundancy) - redundancy_config = create_redundancy_config( - num_logical_experts, total_physical_experts - ) - - old_indices = create_expert_indices_with_redundancy( - num_layers, - num_logical_experts, - total_physical_experts, - redundancy_config, - ) - - # Create new expert indices (with redundancy) - new_redundancy_config = create_redundancy_config( - num_logical_experts, total_physical_experts - ) - new_indices = create_expert_indices_with_redundancy( - num_layers, - num_logical_experts, - total_physical_experts, - new_redundancy_config, - ) - - # Create expert weights - expert_weights = create_expert_weights( - num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices - ) - - # Execute weight rearrangement - rearrange_expert_weights_inplace( - old_indices, - new_indices, - expert_weights, - ep_group, - is_profile=False, - ) - - # Verify the rearrangement result - verify_expert_weights_after_shuffle( - expert_weights, - new_indices, - hidden_sizes, - ep_rank, - num_local_experts, - ) - - verify_redundant_experts_have_same_weights( - expert_weights, - new_indices, - hidden_sizes, - world_size, - num_local_experts, - ) - - distributed_run(worker_fn, world_size) + distributed_run( + _test_rearrange_expert_weights_with_redundancy, + world_size, + num_layers, + num_local_experts, + num_logical_experts, + ) + + +def _test_rearrange_expert_weights_no_change(env, 
world_size) -> None: + set_env_vars_and_device(env) + ensure_model_parallel_initialized( + tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1 + ) + + ep_group = get_tp_group().cpu_group + ep_rank = torch.distributed.get_rank() + device = torch.device(f"cuda:{ep_rank}") + + num_layers = 2 + num_local_experts = 2 + total_physical_experts = world_size * num_local_experts + num_logical_experts = total_physical_experts // 2 # Some redundancy + hidden_sizes = [32, 64] + + # Create redundancy configuration + redundancy_config = [2] * num_logical_experts + + # Same indices - no change + indices = create_expert_indices_with_redundancy( + num_layers, num_logical_experts, total_physical_experts, redundancy_config + ) + + expert_weights = create_expert_weights( + num_layers, num_local_experts, hidden_sizes, ep_rank, device, indices + ) + + # Save original weights + original_weights = [] + for layer_weights in expert_weights: + layer_copy = [] + for weight in layer_weights: + layer_copy.append(weight.clone()) + original_weights.append(layer_copy) + + # Execute rearrangement (should be no change) + rearrange_expert_weights_inplace( + indices, + indices, # Same indices + expert_weights, + ep_group, + is_profile=False, + ) + + # Verify that the weights have not changed + for layer in range(num_layers): + for weight_idx in range(len(hidden_sizes)): + torch.testing.assert_close( + expert_weights[layer][weight_idx], + original_weights[layer][weight_idx], + msg=f"""Layer {layer}, weight {weight_idx} + should remain unchanged""", + ) @pytest.mark.parametrize("world_size", [2, 4]) @@ -388,62 +446,69 @@ def test_rearrange_expert_weights_no_change(world_size): if torch.cuda.device_count() < world_size: pytest.skip(f"Need at least {world_size} GPUs to run the test") + distributed_run(_test_rearrange_expert_weights_no_change, world_size) - @worker_fn_wrapper - def worker_fn(): - ensure_model_parallel_initialized( - tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1 - ) - - ep_group = get_tp_group().cpu_group - ep_rank = torch.distributed.get_rank() - device = torch.device(f"cuda:{ep_rank}") - - num_layers = 2 - num_local_experts = 2 - total_physical_experts = world_size * num_local_experts - num_logical_experts = total_physical_experts // 2 # Some redundancy - hidden_sizes = [32, 64] - - # Create redundancy configuration - redundancy_config = [2] * num_logical_experts - - # Same indices - no change - indices = create_expert_indices_with_redundancy( - num_layers, num_logical_experts, total_physical_experts, redundancy_config - ) - - expert_weights = create_expert_weights( - num_layers, num_local_experts, hidden_sizes, ep_rank, device, indices - ) - - # Save original weights - original_weights = [] - for layer_weights in expert_weights: - layer_copy = [] - for weight in layer_weights: - layer_copy.append(weight.clone()) - original_weights.append(layer_copy) - - # Execute rearrangement (should be no change) - rearrange_expert_weights_inplace( - indices, - indices, # Same indices - expert_weights, - ep_group, - is_profile=False, - ) - - # Verify that the weights have not changed - for layer in range(num_layers): - for weight_idx in range(len(hidden_sizes)): - torch.testing.assert_close( - expert_weights[layer][weight_idx], - original_weights[layer][weight_idx], - msg=f"Layer {layer}, weight {weight_idx} should remain unchanged", - ) - distributed_run(worker_fn, world_size) +def _test_rearrange_expert_weights_profile_mode(env, world_size) -> None: + set_env_vars_and_device(env) + 
ensure_model_parallel_initialized( + tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1 + ) + + ep_group = get_tp_group().cpu_group + ep_rank = torch.distributed.get_rank() + device = torch.device(f"cuda:{ep_rank}") + + num_layers = 1 + num_local_experts = 2 + total_physical_experts = world_size * num_local_experts + num_logical_experts = total_physical_experts // 2 + hidden_sizes = [32] + + # Create different index distributions + old_redundancy = create_redundancy_config( + num_logical_experts, total_physical_experts + ) + new_redundancy = create_redundancy_config( + num_logical_experts, total_physical_experts + ) + + old_indices = create_expert_indices_with_redundancy( + num_layers, num_logical_experts, total_physical_experts, old_redundancy + ) + new_indices = create_expert_indices_with_redundancy( + num_layers, num_logical_experts, total_physical_experts, new_redundancy + ) + + expert_weights = create_expert_weights( + num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices + ) + + # Save original weights + original_weights = [] + for layer_weights in expert_weights: + layer_copy = [] + for weight in layer_weights: + layer_copy.append(weight.clone()) + original_weights.append(layer_copy) + + # Execute profile mode rearrangement + rearrange_expert_weights_inplace( + old_indices, + new_indices, + expert_weights, + ep_group, + is_profile=True, # Profile mode + ) + + # In profile mode, the weights should remain unchanged + for layer in range(num_layers): + for weight_idx in range(len(hidden_sizes)): + torch.testing.assert_close( + expert_weights[layer][weight_idx], + original_weights[layer][weight_idx], + msg="In profile mode, the weights should remain unchanged", + ) @pytest.mark.parametrize("world_size", [2, 4]) @@ -452,66 +517,4 @@ def test_rearrange_expert_weights_profile_mode(world_size): if torch.cuda.device_count() < world_size: pytest.skip(f"Need at least {world_size} GPUs to run the test") - - @worker_fn_wrapper - def worker_fn(): - ensure_model_parallel_initialized( - tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1 - ) - - ep_group = get_tp_group().cpu_group - ep_rank = torch.distributed.get_rank() - device = torch.device(f"cuda:{ep_rank}") - - num_layers = 1 - num_local_experts = 2 - total_physical_experts = world_size * num_local_experts - num_logical_experts = total_physical_experts // 2 - hidden_sizes = [32] - - # Create different index distributions - old_redundancy = create_redundancy_config( - num_logical_experts, total_physical_experts - ) - new_redundancy = create_redundancy_config( - num_logical_experts, total_physical_experts - ) - - old_indices = create_expert_indices_with_redundancy( - num_layers, num_logical_experts, total_physical_experts, old_redundancy - ) - new_indices = create_expert_indices_with_redundancy( - num_layers, num_logical_experts, total_physical_experts, new_redundancy - ) - - expert_weights = create_expert_weights( - num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices - ) - - # Save original weights - original_weights = [] - for layer_weights in expert_weights: - layer_copy = [] - for weight in layer_weights: - layer_copy.append(weight.clone()) - original_weights.append(layer_copy) - - # Execute profile mode rearrangement - rearrange_expert_weights_inplace( - old_indices, - new_indices, - expert_weights, - ep_group, - is_profile=True, # Profile mode - ) - - # In profile mode, the weights should remain unchanged - for layer in range(num_layers): - for weight_idx in 
range(len(hidden_sizes)): - torch.testing.assert_close( - expert_weights[layer][weight_idx], - original_weights[layer][weight_idx], - msg="In profile mode, the weights should remain unchanged", - ) - - distributed_run(worker_fn, world_size) + distributed_run(_test_rearrange_expert_weights_profile_mode, world_size) From 2c52c7fd9a480f96ac93e63eccf9a3ee01686ad4 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 20 Nov 2025 03:52:23 -0500 Subject: [PATCH 231/578] [Bug] Fix torch dynamo warning Dynamo detected a call to a `functools.lru_cache` (#29038) Signed-off-by: yewentao256 --- tests/v1/determinism/conftest.py | 5 +-- tests/v1/determinism/test_batch_invariance.py | 35 +++++-------------- .../test_online_batch_invariance.py | 12 +++++-- tests/v1/determinism/utils.py | 20 +++++++++++ vllm/model_executor/layers/batch_invariant.py | 20 ++++++----- 5 files changed, 52 insertions(+), 40 deletions(-) diff --git a/tests/v1/determinism/conftest.py b/tests/v1/determinism/conftest.py index 3c2136e00584..bde02bbd0d5c 100644 --- a/tests/v1/determinism/conftest.py +++ b/tests/v1/determinism/conftest.py @@ -1,11 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - import pytest +import vllm.model_executor.layers.batch_invariant as batch_invariant + @pytest.fixture(autouse=True) def enable_batch_invariant_mode(monkeypatch: pytest.MonkeyPatch): """Automatically enable batch invariant kernel overrides for all tests.""" + monkeypatch.setattr(batch_invariant, "VLLM_BATCH_INVARIANT", True) monkeypatch.setenv("VLLM_BATCH_INVARIANT", "1") - yield diff --git a/tests/v1/determinism/test_batch_invariance.py b/tests/v1/determinism/test_batch_invariance.py index d4e88891512c..74ae5e182da7 100644 --- a/tests/v1/determinism/test_batch_invariance.py +++ b/tests/v1/determinism/test_batch_invariance.py @@ -6,29 +6,16 @@ import pytest import torch -from utils import _extract_step_logprobs, _random_prompt, skip_unsupported +from utils import ( + BACKENDS, + _extract_step_logprobs, + _random_prompt, + resolve_model_name, + skip_unsupported, +) +import vllm.model_executor.layers.batch_invariant as batch_invariant from vllm import LLM, SamplingParams -from vllm.platforms import current_platform - -BACKENDS: list[str] = [ - "FLASH_ATTN", - "FLASHINFER", -] - -if current_platform.is_cuda() and current_platform.is_device_capability(90): - BACKENDS.append("FLASH_ATTN_MLA") - -DEFAULT_MODEL = "Qwen/Qwen3-1.7B" -MLA_MODEL = "deepseek-ai/DeepSeek-V2-Lite-Chat" - - -def resolve_model_name(backend: str) -> str: - """Resolve the model name for the given backend, respecting env overrides.""" - model = os.getenv("VLLM_TEST_MODEL", DEFAULT_MODEL) - if backend.endswith("MLA") and model == DEFAULT_MODEL: - return MLA_MODEL - return model @skip_unsupported @@ -454,14 +441,10 @@ def test_logprobs_without_batch_invariance_should_fail( The test will PASS if we detect differences (proving batch invariance matters). The test will FAIL if everything matches (suggesting batch invariance isn't needed). 
""" - from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant - - vllm_is_batch_invariant.cache_clear() monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend) # CRITICAL: Disable batch invariance for this test - monkeypatch.setenv("VLLM_BATCH_INVARIANT", "0") - + monkeypatch.setattr(batch_invariant, "VLLM_BATCH_INVARIANT", False) seed = int(os.getenv("VLLM_TEST_SEED", "12345")) random.seed(seed) model_name = resolve_model_name(backend) diff --git a/tests/v1/determinism/test_online_batch_invariance.py b/tests/v1/determinism/test_online_batch_invariance.py index 23f47863dd23..d74b435797f8 100644 --- a/tests/v1/determinism/test_online_batch_invariance.py +++ b/tests/v1/determinism/test_online_batch_invariance.py @@ -16,7 +16,8 @@ from typing import Any import openai -from utils import _random_prompt, skip_unsupported +import pytest +from utils import BACKENDS, _random_prompt, resolve_model_name, skip_unsupported from tests.utils import RemoteOpenAIServer @@ -133,9 +134,14 @@ def _compare_bs1_vs_bsn_single_process( @skip_unsupported -def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(): +@pytest.mark.parametrize("backend", BACKENDS) +def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( + backend: str, monkeypatch: pytest.MonkeyPatch +) -> None: random.seed(int(os.getenv("VLLM_TEST_SEED", "12345"))) - model_name = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B") + # Override backend for this test (and the RemoteOpenAIServer child process). + monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend) + model_name = resolve_model_name(backend) prompts_all = [_random_prompt(10, 50) for _ in range(32)] sp_kwargs: dict[str, Any] = { diff --git a/tests/v1/determinism/utils.py b/tests/v1/determinism/utils.py index 5141837faea0..7ee442551e2c 100644 --- a/tests/v1/determinism/utils.py +++ b/tests/v1/determinism/utils.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os import random import pytest @@ -12,6 +13,25 @@ reason="Requires CUDA and >= Hopper (SM90)", ) +BACKENDS: list[str] = [ + "FLASH_ATTN", + "FLASHINFER", +] + +if current_platform.is_cuda() and current_platform.is_device_capability(90): + BACKENDS.append("FLASH_ATTN_MLA") + +DEFAULT_MODEL = "Qwen/Qwen3-1.7B" +MLA_MODEL = "deepseek-ai/DeepSeek-V2-Lite-Chat" + + +def resolve_model_name(backend: str) -> str: + """Resolve the model name for the given backend.""" + model = os.getenv("VLLM_TEST_MODEL", DEFAULT_MODEL) + if backend.endswith("MLA") and model == DEFAULT_MODEL: + return MLA_MODEL + return model + def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str: # Generate more realistic prompts that will actually produce varied tokens diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py index 5dbeb2917434..69fa6bdffd43 100644 --- a/vllm/model_executor/layers/batch_invariant.py +++ b/vllm/model_executor/layers/batch_invariant.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from collections.abc import Callable -from functools import cache from typing import Any import torch @@ -785,16 +784,19 @@ def enable_batch_invariant_mode(): torch.backends.cuda.preferred_blas_library(backend="cublaslt") -@cache -def vllm_is_batch_invariant(): - env_key = "VLLM_BATCH_INVARIANT" - is_overridden = False - val = os.getenv(env_key, "0") +def _read_vllm_batch_invariant() -> bool: + val = os.getenv("VLLM_BATCH_INVARIANT", "0") try: - 
is_overridden = int(val) != 0 + return int(val) != 0 except ValueError: - is_overridden = False - return is_overridden + return False + + +VLLM_BATCH_INVARIANT: bool = _read_vllm_batch_invariant() + + +def vllm_is_batch_invariant() -> bool: + return VLLM_BATCH_INVARIANT def override_envs_for_invariance(): From 322cb02872d806afcaaa7d0aac3fad7f304b7888 Mon Sep 17 00:00:00 2001 From: rasmith Date: Thu, 20 Nov 2025 03:48:09 -0600 Subject: [PATCH 232/578] [CI/Build][AMD] Fix import errors in tests/kernels/attention (#29032) Signed-off-by: Randall Smith Co-authored-by: Randall Smith --- .../attention/test_cascade_flash_attn.py | 18 +++++++++++++----- tests/kernels/attention/test_flash_attn.py | 19 ++++++++++++++----- tests/kernels/attention/test_flashinfer.py | 12 ++++++++++-- .../attention/test_flashinfer_mla_decode.py | 3 ++- .../test_flashinfer_trtllm_attention.py | 3 ++- tests/kernels/moe/test_flashinfer.py | 9 ++++++++- 6 files changed, 49 insertions(+), 15 deletions(-) diff --git a/tests/kernels/attention/test_cascade_flash_attn.py b/tests/kernels/attention/test_cascade_flash_attn.py index 20f573821b25..d86041d71feb 100755 --- a/tests/kernels/attention/test_cascade_flash_attn.py +++ b/tests/kernels/attention/test_cascade_flash_attn.py @@ -7,11 +7,19 @@ from vllm.platforms import current_platform from vllm.v1.attention.backends.flash_attn import cascade_attention, merge_attn_states -from vllm.vllm_flash_attn import ( - fa_version_unsupported_reason, - flash_attn_varlen_func, - is_fa_version_supported, -) + +try: + from vllm.vllm_flash_attn import ( + fa_version_unsupported_reason, + flash_attn_varlen_func, + is_fa_version_supported, + ) +except ImportError: + if current_platform.is_rocm(): + pytest.skip( + "vllm_flash_attn is not supported for vLLM on ROCm.", + allow_module_level=True, + ) NUM_HEADS = [(4, 4), (8, 2), (16, 2)] HEAD_SIZES = [128, 192, 256] diff --git a/tests/kernels/attention/test_flash_attn.py b/tests/kernels/attention/test_flash_attn.py index 26b8c77ab482..bbd5df5419f8 100644 --- a/tests/kernels/attention/test_flash_attn.py +++ b/tests/kernels/attention/test_flash_attn.py @@ -6,11 +6,20 @@ import torch from vllm.platforms import current_platform -from vllm.vllm_flash_attn import ( - fa_version_unsupported_reason, - flash_attn_varlen_func, - is_fa_version_supported, -) + +try: + from vllm.vllm_flash_attn import ( + fa_version_unsupported_reason, + flash_attn_varlen_func, + is_fa_version_supported, + ) +except ImportError: + if current_platform.is_rocm(): + pytest.skip( + "vllm_flash_attn is not supported for vLLM on ROCm.", + allow_module_level=True, + ) + NUM_HEADS = [(4, 4), (8, 2)] HEAD_SIZES = [40, 72, 80, 128, 256] diff --git a/tests/kernels/attention/test_flashinfer.py b/tests/kernels/attention/test_flashinfer.py index 82ec2ef14e56..eedeec33e0d4 100644 --- a/tests/kernels/attention/test_flashinfer.py +++ b/tests/kernels/attention/test_flashinfer.py @@ -2,12 +2,20 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import flashinfer import pytest -import torch from vllm.platforms import current_platform +try: + import flashinfer +except ImportError: + if current_platform.is_rocm(): + pytest.skip( + "flashinfer is not supported for vLLM on ROCm.", allow_module_level=True + ) + +import torch + NUM_HEADS = [(32, 8), (6, 1)] HEAD_SIZES = [128, 256] BLOCK_SIZES = [16, 32] diff --git a/tests/kernels/attention/test_flashinfer_mla_decode.py b/tests/kernels/attention/test_flashinfer_mla_decode.py index 0350136677c6..d183f67d3919 100644 --- 
a/tests/kernels/attention/test_flashinfer_mla_decode.py +++ b/tests/kernels/attention/test_flashinfer_mla_decode.py @@ -3,7 +3,6 @@ import pytest import torch import torch.nn.functional as F -from flashinfer.decode import trtllm_batch_decode_with_kv_cache_mla from torch import Tensor from vllm.platforms import current_platform @@ -15,6 +14,8 @@ reason="FlashInfer MLA Requires compute capability of 10 or above.", allow_module_level=True, ) +else: + from flashinfer.decode import trtllm_batch_decode_with_kv_cache_mla def ref_mla( diff --git a/tests/kernels/attention/test_flashinfer_trtllm_attention.py b/tests/kernels/attention/test_flashinfer_trtllm_attention.py index 693b849ebc5d..98ea40608b46 100644 --- a/tests/kernels/attention/test_flashinfer_trtllm_attention.py +++ b/tests/kernels/attention/test_flashinfer_trtllm_attention.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import flashinfer import pytest import torch @@ -16,6 +15,8 @@ pytest.skip( "This TRTLLM kernel requires NVIDIA Blackwell.", allow_module_level=True ) +else: + import flashinfer FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 FP8_DTYPE = current_platform.fp8_dtype() diff --git a/tests/kernels/moe/test_flashinfer.py b/tests/kernels/moe/test_flashinfer.py index 218df4a2632c..638741e91619 100644 --- a/tests/kernels/moe/test_flashinfer.py +++ b/tests/kernels/moe/test_flashinfer.py @@ -22,7 +22,14 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import input_to_float8 from vllm.model_executor.models.llama4 import Llama4MoE from vllm.platforms import current_platform -from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe + +try: + from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe +except ImportError: + if current_platform.is_rocm(): + pytest.skip( + "flashinfer not supported for vLLM on ROCm", allow_module_level=True + ) if not has_flashinfer_cutlass_fused_moe() or not current_platform.has_device_capability( 90 From a903d59ffaffd9160c517fa337b3ab0265a898c3 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Thu, 20 Nov 2025 02:51:36 -0800 Subject: [PATCH 233/578] cleanup at::Tag::needs_fixed_stride_order (#28974) Signed-off-by: Boyuan Feng Co-authored-by: Cyrus Leung --- csrc/cpu/torch_bindings.cpp | 7 ++-- csrc/torch_bindings.cpp | 64 +++++++++++-------------------------- 2 files changed, 20 insertions(+), 51 deletions(-) diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index b07d20bab7dd..e0e3ef71b485 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -172,7 +172,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // Quantization #if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__)) || \ defined(__powerpc64__) - at::Tag stride_tag = at::Tag::needs_fixed_stride_order; // Helper function to release oneDNN handlers ops.def("release_dnnl_matmul_handler(int handler) -> ()", &release_dnnl_matmul_handler); @@ -208,15 +207,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // Compute int8 quantized tensor for given scaling factor. ops.def( "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale," - "Tensor? azp) -> ()", - {stride_tag}); + "Tensor? azp) -> ()"); ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant); // Compute int8 quantized tensor and scaling factor ops.def( "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, " - "Tensor!? azp) -> ()", - {stride_tag}); + "Tensor!? 
azp) -> ()"); ops.impl("dynamic_scaled_int8_quant", torch::kCPU, &dynamic_scaled_int8_quant); #endif diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index c3ae06a30e3e..5af74c2c2a6b 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -20,18 +20,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // vLLM custom ops // - // The default behavior in PyTorch 2.6 was changed to "requires_contiguous", - // so we need - // to override this for many GEMMs with the following tag. Otherwise, - // torch.compile will force all input tensors to be contiguous(), which - // will break many custom ops that require column-major weight matrices. - // This was a bug and PyTorch 2.7 has since fixed this. -#if TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 6 - #define stride_tag at::Tag::needs_fixed_stride_order -#else - #define stride_tag -#endif - ops.def( "persistent_masked_m_silu_mul_quant(Tensor input, Tensor counts, Tensor! " "y_q, Tensor! y_s," @@ -241,15 +229,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // Quantized GEMM for AWQ. ops.def( "awq_gemm(Tensor _in_feats, Tensor _kernel, Tensor _scaling_factors, " - "Tensor _zeros, SymInt split_k_iters) -> Tensor", - {stride_tag}); + "Tensor _zeros, SymInt split_k_iters) -> Tensor"); ops.impl("awq_gemm", torch::kCUDA, &awq_gemm); // Dequantization for AWQ. ops.def( "awq_dequantize(Tensor _kernel, Tensor _scaling_factors, " - "Tensor _zeros, SymInt split_k_iters, int thx, int thy) -> Tensor", - {stride_tag}); + "Tensor _zeros, SymInt split_k_iters, int thx, int thy) -> Tensor"); ops.impl("awq_dequantize", torch::kCUDA, &awq_dequantize); // Note about marlin kernel 'workspace' arguments: @@ -271,8 +257,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "gptq_marlin_24_gemm(Tensor a, Tensor b_q_weight, Tensor b_meta, " "Tensor b_scales, Tensor workspace, " "int b_q_type, " - "SymInt size_m, SymInt size_n, SymInt size_k) -> Tensor", - {stride_tag}); + "SymInt size_m, SymInt size_n, SymInt size_k) -> Tensor"); // conditionally compiled so impl in source file // Machete (Dense) Optimized Mixed Precision GEMM for Hopper. @@ -298,8 +283,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor? channel_scales," " Tensor? token_scales," " str? schedule" - ") -> Tensor", - {stride_tag}); + ") -> Tensor"); ops.def( "machete_prepack_B(" " Tensor B," @@ -319,8 +303,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "Tensor b_scales, Tensor? global_scale, Tensor? b_zeros_or_none, Tensor? " "g_idx_or_none, Tensor? perm_or_none, Tensor workspace, int b_q_type, " "SymInt size_m, SymInt size_n, SymInt size_k, bool is_k_full, " - "bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float) -> Tensor", - {stride_tag}); + "bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float) -> Tensor"); // conditionally compiled so impl registration is in source file // gptq_marlin repack from GPTQ. @@ -346,8 +329,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor token_scales," " ScalarType? out_type," " str? maybe_schedule" - ") -> Tensor", - {stride_tag}); + ") -> Tensor"); // pack scales ops.def("cutlass_pack_scale_fp8(Tensor scales) -> Tensor"); // encode and reorder weight matrix @@ -394,24 +376,21 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def( "cutlass_scaled_fp4_mm(Tensor! 
out, Tensor a, Tensor b," " Tensor block_scale_a, Tensor block_scale_b," - " Tensor alpha) -> ()", - {stride_tag}); + " Tensor alpha) -> ()"); ops.impl("cutlass_scaled_fp4_mm", torch::kCUDA, &cutlass_scaled_fp4_mm); // cutlass blockwise scaledgroup GEMM ops.def( "cutlass_blockwise_scaled_grouped_mm(Tensor! output, Tensor a, Tensor b, " "Tensor scales_a, Tensor scales_b, " - "Tensor problem_sizes, Tensor expert_offsets) -> ()", - {stride_tag}); + "Tensor problem_sizes, Tensor expert_offsets) -> ()"); // conditionally compiled so impl registration is in source file // cutlass nvfp4 block scaled group GEMM ops.def( "cutlass_fp4_group_mm(Tensor! out, Tensor a, Tensor b," " Tensor a_blockscale, Tensor b_blockscales, Tensor alphas," - " Tensor problem_sizes, Tensor expert_offsets, Tensor sf_offsets) -> ()", - {stride_tag}); + " Tensor problem_sizes, Tensor expert_offsets, Tensor sf_offsets) -> ()"); // conditionally compiled so impl registration is in source file // CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column @@ -419,8 +398,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def( "cutlass_scaled_mm(Tensor! out, Tensor a," " Tensor b, Tensor a_scales," - " Tensor b_scales, Tensor? bias) -> ()", - {stride_tag}); + " Tensor b_scales, Tensor? bias) -> ()"); ops.impl("cutlass_scaled_mm", torch::kCUDA, &cutlass_scaled_mm); // CUTLASS w8a8 GEMM, supporting asymmetric per-tensor or per-row/column @@ -429,8 +407,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "cutlass_scaled_mm_azp(Tensor! out, Tensor a," " Tensor b, Tensor a_scales," " Tensor b_scales, Tensor azp_adj," - " Tensor? azp, Tensor? bias) -> ()", - {stride_tag}); + " Tensor? azp, Tensor? bias) -> ()"); ops.impl("cutlass_scaled_mm_azp", torch::kCUDA, &cutlass_scaled_mm_azp); // Check if cutlass scaled_mm is supported for CUDA devices of the given @@ -449,8 +426,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor a_scales, Tensor b_scales, Tensor expert_offsets, " " Tensor problem_sizes, Tensor a_strides, " " Tensor b_strides, Tensor c_strides, bool per_act_token, " - " bool per_out_ch) -> ()", - {stride_tag}); + " bool per_out_ch) -> ()"); ops.impl("cutlass_moe_mm", torch::kCUDA, &cutlass_moe_mm); // A function that computes data required to run fused MoE with w8a8 grouped @@ -464,8 +440,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor! problem_sizes1, Tensor! problem_sizes2, " " Tensor! input_permutation, " " Tensor! output_permutation, int num_experts, " - " int n, int k, Tensor? blockscale_offsets) -> ()", - {stride_tag}); + " int n, int k, Tensor? blockscale_offsets) -> " + "()"); ops.impl("get_cutlass_moe_mm_data", torch::kCUDA, &get_cutlass_moe_mm_data); // A function that computes problem sizes for each expert's multiplication @@ -476,8 +452,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor! problem_sizes1, " " Tensor! problem_sizes2, " " int num_experts, int n, int k, " - " Tensor? blockscale_offsets) -> ()", - {stride_tag}); + " Tensor? blockscale_offsets) -> ()"); ops.impl("get_cutlass_moe_mm_problem_sizes", torch::kCUDA, &get_cutlass_moe_mm_problem_sizes); @@ -492,8 +467,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor! 
problem_sizes2, " " Tensor expert_num_tokens, " " int num_local_experts, int padded_m, " - " int n, int k) -> ()", - {stride_tag}); + " int n, int k) -> ()"); ops.impl("get_cutlass_pplx_moe_mm_data", torch::kCUDA, &get_cutlass_pplx_moe_mm_data); @@ -517,8 +491,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "cutlass_scaled_sparse_mm(Tensor! out, Tensor a," " Tensor bt_nzs," " Tensor bt_meta, Tensor a_scales," - " Tensor b_scales, Tensor? bias) -> ()", - {stride_tag}); + " Tensor b_scales, Tensor? bias) -> ()"); ops.impl("cutlass_scaled_sparse_mm", torch::kCUDA, &cutlass_scaled_sparse_mm); // CUTLASS sparse matrix compressor @@ -567,8 +540,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "gptq_gemm(Tensor a, Tensor b_q_weight, Tensor b_gptq_qzeros, " "Tensor b_gptq_scales, Tensor b_g_idx, bool use_exllama, bool " "use_v2_format, int bit) " - "-> Tensor", - {stride_tag}); + "-> Tensor"); ops.impl("gptq_gemm", torch::kCUDA, &gptq_gemm); // Post processing for GPTQ. From fb8851f25485c3c94b0a71b77ff800f55ba328cf Mon Sep 17 00:00:00 2001 From: Vensen Date: Thu, 20 Nov 2025 18:52:02 +0800 Subject: [PATCH 234/578] [Bugfix][cache_kernels]: Fix OOB in cache_kernels.cu (#28760) Signed-off-by: vensen Signed-off-by: Vensenmu --- csrc/cache_kernels.cu | 19 +++++---- tests/kernels/test_cache_kernels.py | 65 +++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 7 deletions(-) create mode 100644 tests/kernels/test_cache_kernels.py diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 0aa0dc14c748..a6c953ee0eac 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -965,7 +965,9 @@ __global__ void gather_and_maybe_dequant_cache( } }; - for (int pid = split_start; pid < full_blocks_end; ++pid) { + const auto loop_end = + std::min((int64_t)full_blocks_end, block_table_stride - offset); + for (int pid = split_start; pid < loop_end; ++pid) { auto block_id = batch_block_table[pid]; auto block_start_ptr = src_cache + block_id * cache_block_stride; auto block_dst_ptr = dst + pid * block_size * dst_entry_stride; @@ -976,12 +978,15 @@ __global__ void gather_and_maybe_dequant_cache( } if (partial_block_size) { - auto block_id = batch_block_table[full_blocks_end]; - auto block_start_ptr = src_cache + block_id * cache_block_stride; - auto block_dst_ptr = dst + full_blocks_end * block_size * dst_entry_stride; - for (int eid = 0; eid < partial_block_size; ++eid) { - copy_entry(block_start_ptr + eid * cache_entry_stride, - block_dst_ptr + eid * dst_entry_stride); + if (offset + full_blocks_end < block_table_stride) { + auto block_id = batch_block_table[full_blocks_end]; + auto block_start_ptr = src_cache + block_id * cache_block_stride; + auto block_dst_ptr = + dst + full_blocks_end * block_size * dst_entry_stride; + for (int eid = 0; eid < partial_block_size; ++eid) { + copy_entry(block_start_ptr + eid * cache_entry_stride, + block_dst_ptr + eid * dst_entry_stride); + } } } } diff --git a/tests/kernels/test_cache_kernels.py b/tests/kernels/test_cache_kernels.py new file mode 100644 index 000000000000..b5d66b4ede88 --- /dev/null +++ b/tests/kernels/test_cache_kernels.py @@ -0,0 +1,65 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Unit tests for CUDA kernels in cache_kernels.cu.""" + +import pytest +import torch + +try: + from vllm import _custom_ops as ops +except ImportError: + pytest.skip( + "Could not import vllm._custom_ops. 
(pip install -e .)", allow_module_level=True + ) + + +@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Need CUDA device") +def test_gather_cache_oob(): + """ + Tests for OOB read in gather_and_maybe_dequant_cache (Issue #27909). + This test constructs a boundary case identified in the issue where + seq_starts causes the block_table offset to read out of bounds. + """ + + batch_size = 1 + block_size = 64 + entry_size = 128 + + block_table = torch.tensor([[1, 2]], dtype=torch.int32, device="cuda") + + # This will result in offset = 128 / block_size = 128 / 64 = 2 + # This will cause the kernel to try to read from + # block_table[0, 2], but its size is only 2. + seq_starts = torch.tensor([128], dtype=torch.int32, device="cuda") + + seq_len = 65 + cu_seq_lens = torch.tensor([0, seq_len], dtype=torch.int32, device="cuda") + + # src_cache: [num_blocks, block_size, entry_size] + num_blocks = 5 + src_cache = torch.randn( + (num_blocks, block_size, entry_size), dtype=torch.float16, device="cuda" + ) + + dst = torch.empty((seq_len, entry_size), dtype=torch.float16, device="cuda") + + scale = torch.tensor([1.0], dtype=torch.float32, device="cuda") + + # Calling the C++ function gather_and_maybe_dequant_cache + ops.gather_and_maybe_dequant_cache( + src_cache, + dst, + block_table, + cu_seq_lens, + batch_size, + "auto", # kv_cache_dtype + scale, + seq_starts, + ) + + torch.cuda.synchronize() + assert True + + +if __name__ == "__main__": + pytest.main([__file__]) From dc45efc8ef7fc1e2571331eaf4671e1652e2a865 Mon Sep 17 00:00:00 2001 From: Dezhan Date: Thu, 20 Nov 2025 02:52:36 -0800 Subject: [PATCH 235/578] [BugFix] Fix Llama4 Pipeline Parallelism Assert Error (#28577) Co-authored-by: Dezhan Tu --- vllm/model_executor/models/llama4.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index 4c6d1d424475..e1bdfc3405f7 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -53,6 +53,7 @@ from .llama import LlamaForCausalLM, LlamaMLP, LlamaModel from .utils import ( AutoWeightsLoader, + PPMissingLayer, extract_layer_index, fast_topk, is_pp_missing_parameter, @@ -729,6 +730,9 @@ def set_moe_parameters(self): self.moe_layers = [] example_moe = None for layer in self.model.layers: + if isinstance(layer, PPMissingLayer): + continue + assert isinstance(layer, Llama4DecoderLayer) if isinstance(layer.feed_forward, Llama4MoE): # Pick last one layer since the first ones may be dense layers. 
@@ -765,6 +769,9 @@ def update_physical_experts_metadata( self.num_local_physical_experts = num_local_physical_experts self.num_redundant_experts = num_physical_experts - self.num_logical_experts for layer in self.model.layers: + if isinstance(layer, PPMissingLayer): + continue + if isinstance(layer.feed_forward, Llama4MoE): moe = layer.feed_forward moe.n_local_physical_experts = num_local_physical_experts From edfe867208482ccadbf0ef503fc43e1fbb1e48f6 Mon Sep 17 00:00:00 2001 From: Jinzhen Lin Date: Thu, 20 Nov 2025 18:52:53 +0800 Subject: [PATCH 236/578] [Misc] don't cache `CUTLASS_REVISION` var in CMakeLists.txt (#28518) Signed-off-by: Jinzhen Lin Co-authored-by: Lucas Wilkinson --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ae8e6175443f..a4cf51d17e98 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -307,7 +307,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building. - set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use") + set(CUTLASS_REVISION "v4.2.1") # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR}) From 66483a9d00e4e26647dd26b4c49f6eca73972b8c Mon Sep 17 00:00:00 2001 From: cjackal <44624812+cjackal@users.noreply.github.com> Date: Thu, 20 Nov 2025 19:53:09 +0900 Subject: [PATCH 237/578] [Chore] Update `xgrammar` version from 0.1.25 to 0.1.27 (#28221) Signed-off-by: cjackal <44624812+cjackal@users.noreply.github.com> --- requirements/common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/common.txt b/requirements/common.txt index 1058ab91a02a..f2d1c0762ef6 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -24,7 +24,7 @@ outlines_core == 0.2.11 # required for outlines backend disk cache diskcache == 5.6.3 lark == 1.2.2 -xgrammar == 0.1.25; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" or platform_machine == "s390x" +xgrammar == 0.1.27; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" or platform_machine == "s390x" typing_extensions >= 4.10 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 partial-json-parser # used for parsing partial JSON outputs From 6eb745d9bdf5b69bb63f897b32465c62ecb9e14a Mon Sep 17 00:00:00 2001 From: Anna Shors Date: Thu, 20 Nov 2025 02:53:50 -0800 Subject: [PATCH 238/578] Add truncate arg to yarn to match openai implementation of gpt-oss (#28244) Signed-off-by: ashors1 Co-authored-by: Chen Zhang --- .../layers/rotary_embedding/__init__.py | 1 + .../layers/rotary_embedding/common.py | 14 +++++++------- .../layers/rotary_embedding/yarn_scaling_rope.py | 3 +++ vllm/model_executor/models/gpt_oss.py | 1 + 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding/__init__.py b/vllm/model_executor/layers/rotary_embedding/__init__.py index ae8a7d93b50e..152d9401b8e9 100644 --- a/vllm/model_executor/layers/rotary_embedding/__init__.py +++ b/vllm/model_executor/layers/rotary_embedding/__init__.py @@ -197,6 +197,7 @@ def get_rope( "beta_fast", "beta_slow", "apply_yarn_scaling", + "truncate", ) } if "mrope_section" in rope_parameters: diff --git a/vllm/model_executor/layers/rotary_embedding/common.py 
b/vllm/model_executor/layers/rotary_embedding/common.py index 196533b61795..13f8d15cc0f7 100644 --- a/vllm/model_executor/layers/rotary_embedding/common.py +++ b/vllm/model_executor/layers/rotary_embedding/common.py @@ -117,13 +117,13 @@ def yarn_find_correction_range( dim: int, base: float = 10000, max_position_embeddings: int = 2048, -) -> tuple[int, int]: - low = math.floor( - yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings) - ) - high = math.ceil( - yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings) - ) + truncate: bool = True, +) -> tuple[float | int, float | int]: + low = yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings) + high = yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings) + if truncate: + low = math.floor(low) + high = math.ceil(high) return max(low, 0), min(high, dim - 1) # Clamp values just in case diff --git a/vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py b/vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py index ff46ad74b302..f01ca1e23121 100644 --- a/vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py +++ b/vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py @@ -28,12 +28,14 @@ def __init__( beta_fast: int = 32, beta_slow: int = 1, apply_yarn_scaling: bool = True, + truncate: bool = True, ) -> None: self.scaling_factor = scaling_factor self.extrapolation_factor = extrapolation_factor self.attn_factor = attn_factor self.beta_fast = beta_fast self.beta_slow = beta_slow + self.truncate = truncate # Get n-d magnitude scaling corrected for interpolation self.mscale = ( float(yarn_get_mscale(self.scaling_factor) * attn_factor) @@ -57,6 +59,7 @@ def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor: self.rotary_dim, self.base, self.max_position_embeddings, + self.truncate, ) # Get n-d rotational scaling corrected for extrapolation inv_freq_mask = ( diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 25048330f797..8835acb8ec65 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -78,6 +78,7 @@ def __init__( ], "beta_fast": config.rope_parameters["beta_fast"], "beta_slow": config.rope_parameters["beta_slow"], + "truncate": config.rope_parameters.get("truncate", True), }, is_neox_style=True, ) From 06c20c9904644d8f65523bb747756b2eae706b8e Mon Sep 17 00:00:00 2001 From: Pleaplusone Date: Thu, 20 Nov 2025 18:54:01 +0800 Subject: [PATCH 239/578] [ROCm] Add AMD GPU support on Deepseek v3.2 and SparseMLA (#26670) Signed-off-by: ganyi --- csrc/cache_kernels.cu | 4 + vllm/attention/ops/rocm_aiter_mla_sparse.py | 210 +++++++++++ vllm/model_executor/models/deepseek_v2.py | 22 +- vllm/platforms/rocm.py | 13 +- vllm/utils/deep_gemm.py | 5 +- .../attention/backends/mla/flashmla_sparse.py | 2 +- vllm/v1/attention/backends/mla/indexer.py | 15 +- .../backends/mla/rocm_aiter_mla_sparse.py | 325 ++++++++++++++++++ vllm/v1/worker/utils.py | 2 +- 9 files changed, 583 insertions(+), 15 deletions(-) create mode 100644 vllm/attention/ops/rocm_aiter_mla_sparse.py create mode 100644 vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index a6c953ee0eac..32960cc8073b 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -552,7 +552,11 @@ __global__ void indexer_k_quant_and_cache_kernel( #ifndef USE_ROCM __syncwarp(); #endif +#if defined(__gfx942__) + float scale = fmaxf(amax, 1e-4) / 224.0f; 
+#else float scale = fmaxf(amax, 1e-4) / 448.0f; +#endif if (use_ue8m0) { scale = exp2f(ceilf(log2f(scale))); } diff --git a/vllm/attention/ops/rocm_aiter_mla_sparse.py b/vllm/attention/ops/rocm_aiter_mla_sparse.py new file mode 100644 index 000000000000..080e92ecc940 --- /dev/null +++ b/vllm/attention/ops/rocm_aiter_mla_sparse.py @@ -0,0 +1,210 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import importlib +from functools import lru_cache + +import torch + +from vllm._aiter_ops import rocm_aiter_ops +from vllm.logger import init_logger +from vllm.platforms import current_platform + +logger = init_logger(__name__) + + +# Take from https://github.com/deepseek-ai/DeepGEMM/blob/main/tests/test_attention.py#L84 +def fp8_mqa_logits_torch( + q: torch.Tensor, + kv: tuple[torch.Tensor, torch.Tensor], + weights: torch.Tensor, + cu_seqlen_ks: torch.Tensor, + cu_seqlen_ke: torch.Tensor, +) -> torch.Tensor: + """Compute FP8 MQA logits for a single sequence without KV paging. + + Args: + q: Query tensor of shape [M, H, D]. Casted to + `torch.float8_e4m3fn` by caller. + kv: Tuple `(k_fp8, k_scales)` where `k_fp8` has shape [N, D] with + dtype `torch.float8_e4m3fn` and `k_scales` has shape [N] (or + [N, 1]) with dtype `torch.float32`. + weights: weights of shape [M, H], dtype `torch.float32`. + cu_seqlen_ks: Start indices (inclusive) for valid K per query position, + shape [M], dtype int32. + cu_seqlen_ke: End indices (exclusive) for valid K per query position, + shape [M], dtype int32. + + Returns: + Logits tensor of shape [M, N], dtype `torch.float32`. + """ + kv, scale = kv + seq_len_kv = kv.shape[0] + k = kv.to(torch.bfloat16) + q = q.to(torch.bfloat16) + + mask_lo = ( + torch.arange(0, seq_len_kv, device="cuda")[None, :] >= cu_seqlen_ks[:, None] + ) + mask_hi = ( + torch.arange(0, seq_len_kv, device="cuda")[None, :] < cu_seqlen_ke[:, None] + ) + mask = mask_lo & mask_hi + + score = torch.einsum("mhd,nd->hmn", q, k).float() * scale + logits = (score.relu() * weights.unsqueeze(-1).transpose(0, 1)).sum(dim=0) + logits = logits.masked_fill(~mask, float("-inf")) + + return logits + + +def rocm_fp8_mqa_logits( + q: torch.Tensor, + kv: tuple[torch.Tensor, torch.Tensor], + weights: torch.Tensor, + cu_seqlen_ks: torch.Tensor, + cu_seqlen_ke: torch.Tensor, +) -> torch.Tensor: + """Compute FP8 MQA logits for a single sequence without KV paging. + + Args: + q: Query tensor of shape [M, H, D]. Casted to + `torch.float8_e4m3fn` by caller. + kv: Tuple `(k_fp8, k_scales)` where `k_fp8` has shape [N, D] with + dtype `torch.float8_e4m3fn` and `k_scales` has shape [N] (or + [N, 1]) with dtype `torch.float32`. + weights: weights of shape [M, H], dtype `torch.float32`. + cu_seqlen_ks: Start indices (inclusive) for valid K per query position, + shape [M], dtype int32. + cu_seqlen_ke: End indices (exclusive) for valid K per query position, + shape [M], dtype int32. + + Returns: + Logits tensor of shape [M, N], dtype `torch.float32`. 
+ """ + + # TODO(ganyi): Temporarily workaround, will remove the module check and reference + # path after aiter merge this kernel into main + @lru_cache + def has_mqa_logits_module(): + return importlib.util.find_spec("aiter.ops.triton.fp8_mqa_logits") is not None + + if rocm_aiter_ops.is_enabled() and has_mqa_logits_module(): + from aiter.ops.triton.fp8_mqa_logits import fp8_mqa_logits + + kv, scale = kv + return fp8_mqa_logits(q, kv, scale, weights, cu_seqlen_ks, cu_seqlen_ke) + else: + return fp8_mqa_logits_torch(q, kv, weights, cu_seqlen_ks, cu_seqlen_ke) + + +# Taken from https://github.com/deepseek-ai/DeepGEMM/blob/main/tests/test_attention.py#L156 +def fp8_paged_mqa_logits_torch( + q: torch.Tensor, + kv_cache: torch.Tensor, + weights: torch.Tensor, + context_lens: torch.Tensor, + block_tables: torch.Tensor, + max_model_len: int, +): + from vllm.utils.math_utils import cdiv + + fp8_dtype = current_platform.fp8_dtype() + batch_size, next_n, _, dim = q.size() + kv_cache, scale = kv_cache[..., :dim], kv_cache[..., dim:] + scale = scale.contiguous().view(torch.float) + q = q.float() + kv_cache = kv_cache.view(fp8_dtype).float() * scale + num_block, block_size, _, dim = kv_cache.size() + logits = torch.full( + [batch_size * next_n, max_model_len], + float("-inf"), + device=q.device, + dtype=torch.float32, + ) + context_lens = context_lens.tolist() + for i in range(batch_size): + context_len = context_lens[i] + q_offsets = torch.arange(context_len - next_n, context_len, device="cuda") + weight_slice = ( + weights[i * next_n : (i + 1) * next_n, :].transpose(0, 1).contiguous() + ) + for block_rk in range(cdiv(context_len, block_size)): + block_idx = block_tables[i][block_rk] + qx, kx = q[i], kv_cache[block_idx] + k_offsets = torch.arange( + block_rk * block_size, (block_rk + 1) * block_size, device="cuda" + ) + mask = (k_offsets[None, :] < context_len) & ( + k_offsets[None, :] <= q_offsets[:, None] + ) + s = torch.where( + mask[None, :, :], + (qx.transpose(0, 1) @ kx.transpose(0, 1).transpose(1, 2)).to( + logits.dtype + ), + float("-inf"), + ) + s = torch.relu(s) * weight_slice[..., None] + s = s.sum(dim=0) + logits[ + i * next_n : (i + 1) * next_n, + block_rk * block_size : (block_rk + 1) * block_size, + ] = torch.where(k_offsets[None, :] <= q_offsets[:, None], s, float("-inf")) + return logits + + +def rocm_fp8_paged_mqa_logits( + q_fp8: torch.Tensor, + kv_cache_fp8: torch.Tensor, + weights: torch.Tensor, + context_lens: torch.Tensor, + block_tables: torch.Tensor, + schedule_metadata: torch.Tensor, + max_model_len: int, +) -> torch.Tensor: + """Compute FP8 MQA logits using paged KV-cache. + + Args: + q_fp8: Query tensor of shape [B, next_n, H, D]. Casted to + `torch.float8_e4m3fn` by caller. + kv_cache_fp8: Paged KV-cache in packed FP8+scale layout with shape + [num_blocks, block_size, 1, D+4], dtype `torch.uint8`. The last + 4 bytes per (block,pos) store the `float` dequant scale. + weights: Tensor of shape [B * next_n, H], dtype `torch.float32`. + context_lens: Tensor of shape [B], dtype int32; effective context length + for each batch element. + block_tables: Tensor of shape [B, max_blocks], dtype int32; maps logical + block indices to physical blocks in the paged cache. + schedule_metadata: Returned by `get_paged_mqa_logits_metadata`; + used to distribute work across SMs. + max_model_len: Maximum sequence length used to size the logits output. + + Returns: + Logits tensor of shape [B * next_n, max_model_len], dtype + `torch.float32`. 
+ """ + + if rocm_aiter_ops.is_enabled(): + from aiter.ops.triton.pa_mqa_logits import deepgemm_fp8_paged_mqa_logits_stage1 + + batch_size, next_n, heads, _ = q_fp8.shape + out_qk = torch.full( + (heads, batch_size * next_n, max_model_len), + float("-inf"), + device="cuda", + dtype=torch.float32, + ) + deepgemm_fp8_paged_mqa_logits_stage1( + q_fp8, + kv_cache_fp8, + weights, + out_qk, + context_lens, + block_tables, + max_model_len, + ) + return out_qk.sum(dim=0) + else: + return fp8_paged_mqa_logits_torch( + q_fp8, kv_cache_fp8, weights, context_lens, block_tables, max_model_len + ) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index d0a116b97997..7cfd381592b4 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -594,6 +594,7 @@ def sparse_attn_indexer( ) -> torch.Tensor: # careful! this will be None in dummy run attn_metadata = get_forward_context().attn_metadata + fp8_dtype = current_platform.fp8_dtype() # assert isinstance(attn_metadata, dict) if not isinstance(attn_metadata, dict): return sparse_attn_indexer_fake( @@ -633,7 +634,7 @@ def sparse_attn_indexer( k_fp8 = torch.empty( [chunk.total_seq_lens, head_dim], device=k.device, - dtype=torch.float8_e4m3fn, + dtype=fp8_dtype, ) k_scale = torch.empty( [chunk.total_seq_lens, 4], @@ -647,7 +648,12 @@ def sparse_attn_indexer( chunk.block_table, chunk.cu_seq_lens, ) - logits = fp8_mqa_logits( + fp8_mqa_logits_func = fp8_mqa_logits + if current_platform.is_rocm(): + from vllm.attention.ops.rocm_aiter_mla_sparse import rocm_fp8_mqa_logits + + fp8_mqa_logits_func = rocm_fp8_mqa_logits + logits = fp8_mqa_logits_func( q_fp8[chunk.token_start : chunk.token_end], (k_fp8, k_scale.view(torch.float32)), weights[chunk.token_start : chunk.token_end], @@ -692,7 +698,14 @@ def sparse_attn_indexer( next_n = padded_q_fp8_decode_tokens.shape[1] assert batch_size == decode_metadata.seq_lens.shape[0] num_padded_tokens = batch_size * next_n - logits = fp8_paged_mqa_logits( + fp8_paged_mqa_logits_func = fp8_paged_mqa_logits + if current_platform.is_rocm(): + from vllm.attention.ops.rocm_aiter_mla_sparse import ( + rocm_fp8_paged_mqa_logits, + ) + + fp8_paged_mqa_logits_func = rocm_fp8_paged_mqa_logits + logits = fp8_paged_mqa_logits_func( padded_q_fp8_decode_tokens, kv_cache, weights[:num_padded_tokens], @@ -749,7 +762,8 @@ def sparse_attn_indexer_fake( _flattened_kv = torch.empty( [total_seq_lens, head_dim + 4], device=k.device, dtype=torch.uint8 ) - _k_fp8 = _flattened_kv[..., :head_dim].view(torch.float8_e4m3fn).contiguous() + fp8_dtype = current_platform.fp8_dtype() + _k_fp8 = _flattened_kv[..., :head_dim].view(fp8_dtype).contiguous() _k_scale = _flattened_kv[..., head_dim:].view(torch.float32).contiguous() return topk_indices_buffer diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index f07f068a9249..1a2f9226ddce 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -225,7 +225,18 @@ def get_attn_backend_cls( from vllm.attention.backends.registry import AttentionBackendEnum if use_sparse: - raise NotImplementedError("Sparse Attention is not supported on ROCm.") + if kv_cache_dtype.startswith("fp8"): + raise ValueError( + "ROCMAiterMLASparseBackend doesn't support fp8 kv_cache_dtype." + ) + assert block_size == 1, ( + "Sparse MLA backend on ROCm only supports block size 1 for now." + ) + logger.info_once("Using Sparse MLA backend on V1 engine.") + return ( + "vllm.v1.attention.backends.mla.rocm_aiter_mla_sparse." 
+ "ROCMAiterMLASparseBackend" + ) if use_mla: if selected_backend is None: diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index 6b0a383a0e28..b25c1e3e1ece 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -325,6 +325,7 @@ def _align(x: int, y: int) -> int: def per_block_cast_to_fp8( x: torch.Tensor, block_size: list[int] = DEFAULT_BLOCK_SIZE, use_ue8m0: bool = False ) -> tuple[torch.Tensor, torch.Tensor]: + fp8_dtype = current_platform.fp8_dtype() assert x.dim() == 2 m, n = x.shape block_m, block_n = block_size @@ -334,9 +335,9 @@ def per_block_cast_to_fp8( x_padded[:m, :n] = x x_view = x_padded.view(-1, block_m, x_padded.size(1) // block_n, block_n) x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) - sf = x_amax / 448.0 + sf = x_amax / 224.0 if current_platform.is_fp8_fnuz() else x_amax / 448.0 sf = _ceil_to_ue8m0(sf) if use_ue8m0 else sf - x_scaled = (x_view * (1.0 / sf)).to(torch.float8_e4m3fn) + x_scaled = (x_view * (1.0 / sf)).to(fp8_dtype) return x_scaled.view_as(x_padded)[:m, :n].contiguous(), sf.view( x_view.size(0), x_view.size(2) ) diff --git a/vllm/v1/attention/backends/mla/flashmla_sparse.py b/vllm/v1/attention/backends/mla/flashmla_sparse.py index bb8d914d1571..3f2cc8c38327 100644 --- a/vllm/v1/attention/backends/mla/flashmla_sparse.py +++ b/vllm/v1/attention/backends/mla/flashmla_sparse.py @@ -168,7 +168,7 @@ def _convert_req_index_to_global_index_kernel( inblock_off = tok % BLOCK_SIZE # Guard block_table access - valid_block = block_id < max_num_blocks_per_req + valid_block = (block_id < max_num_blocks_per_req) & (block_id >= 0) bt_ptr = block_table_ptr + req * bt_stride0 + block_id * bt_stride1 base = tl.load(bt_ptr, mask=valid_block, other=0) diff --git a/vllm/v1/attention/backends/mla/indexer.py b/vllm/v1/attention/backends/mla/indexer.py index 37aa5dad89a0..cc0988435768 100644 --- a/vllm/v1/attention/backends/mla/indexer.py +++ b/vllm/v1/attention/backends/mla/indexer.py @@ -11,7 +11,8 @@ ) from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.utils.deep_gemm import get_paged_mqa_logits_metadata +from vllm.platforms import current_platform +from vllm.utils.deep_gemm import get_paged_mqa_logits_metadata, is_deep_gemm_supported from vllm.v1.attention.backends.utils import ( AttentionCGSupport, AttentionMetadataBuilder, @@ -23,7 +24,9 @@ class DeepseekV32IndexerBackend(AttentionBackend): - supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [64] + supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [ + 1 if current_platform.is_rocm() else 64 + ] @classmethod def get_supported_head_sizes(cls) -> list[int]: @@ -328,10 +331,10 @@ def build( requires_padding = (decode_lens_cpu.max() > decode_lens_cpu.min()).item() seq_lens = common_attn_metadata.seq_lens[:num_decodes] - - self.scheduler_metadata_buffer[:] = get_paged_mqa_logits_metadata( - seq_lens, self.kv_cache_spec.block_size, self.num_sms - ) + if is_deep_gemm_supported(): + self.scheduler_metadata_buffer[:] = get_paged_mqa_logits_metadata( + seq_lens, self.kv_cache_spec.block_size, self.num_sms + ) decode_metadata = DeepSeekV32IndexerDecodeMetadata( block_table=common_attn_metadata.block_table_tensor[:num_decodes, ...], seq_lens=common_attn_metadata.seq_lens[:num_decodes], diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py new file mode 100644 index 000000000000..c0e7f0e380b9 --- /dev/null +++ 
b/vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py @@ -0,0 +1,325 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from dataclasses import dataclass +from typing import TYPE_CHECKING, ClassVar, Optional + +import numpy as np +import torch + +from vllm import _custom_ops as ops +from vllm._aiter_ops import rocm_aiter_ops +from vllm.attention.backends.abstract import ( + AttentionBackend, + AttentionLayer, + AttentionMetadata, +) +from vllm.attention.backends.utils import get_mla_dims +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.v1.attention.backends.mla.common import ( + MLACommonBaseImpl, +) +from vllm.v1.attention.backends.mla.flashmla_sparse import ( + triton_convert_req_index_to_global_index, +) +from vllm.v1.attention.backends.utils import ( + AttentionCGSupport, + AttentionMetadataBuilder, + CommonAttentionMetadata, +) +from vllm.v1.kv_cache_interface import AttentionSpec + +if TYPE_CHECKING: + from vllm.model_executor.models.deepseek_v2 import Indexer +logger = init_logger(__name__) + + +class ROCMAiterMLASparseBackend(AttentionBackend): + accept_output_buffer: bool = True + + @staticmethod + def get_name() -> str: + return "ROCM_AITER_MLA_SPARSE" + + @staticmethod + def get_metadata_cls() -> type[AttentionMetadata]: + return ROCMAiterMLASparseMetadata + + @staticmethod + def get_builder_cls() -> type["ROCMAiterMLASparseMetadataBuilder"]: + return ROCMAiterMLASparseMetadataBuilder + + @staticmethod + def get_impl_cls() -> type["ROCMAiterMLASparseImpl"]: + return ROCMAiterMLASparseImpl + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, # assumed to be 1 for MLA + head_size: int, + cache_dtype_str: str = "auto", + ) -> tuple[int, ...]: + return (num_blocks, block_size, head_size) + + @classmethod + def get_supported_dtypes(cls) -> list[torch.dtype]: + return [torch.bfloat16] + + @classmethod + def get_supported_head_sizes(cls) -> list[int]: + return [576] + + +@dataclass +class ROCMAiterMLASparseMetadata: + num_reqs: int + max_query_len: int + max_seq_len: int + + num_actual_tokens: int # Number of tokens excluding padding. 
+ query_start_loc: torch.Tensor + slot_mapping: torch.Tensor + + block_table: torch.Tensor + req_id_per_token: torch.Tensor + block_size: int = 1 + topk_tokens: int = 2048 + + +@dataclass +class ROCMAiterMLASparseMetadataBuilder( + AttentionMetadataBuilder[ROCMAiterMLASparseMetadata] +): + cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.NEVER + + def __init__( + self, + kv_cache_spec: AttentionSpec, + layer_names: list[str], + vllm_config: VllmConfig, + device: torch.device, + ): + self.kv_cache_spec = kv_cache_spec + self.model_config = vllm_config.model_config + parallel_config = vllm_config.parallel_config + self.device = device + + self.num_heads = self.model_config.get_num_attention_heads(parallel_config) + self.mla_dims = get_mla_dims(self.model_config) + self.topk_tokens = vllm_config.model_config.hf_config.index_topk + self.topk_tokens_tensor = torch.tensor( + [self.topk_tokens], device=device, dtype=torch.int32 + ) + self.max_model_len_tensor = torch.tensor( + [self.model_config.max_model_len], device=device, dtype=torch.int32 + ) + # this is ignored by `flash_mla_with_kvcache` if indices not None + self.dummy_block_table = torch.empty( + (1, 1), dtype=torch.int32, device=self.device + ) + + self.req_id_per_token_buffer = torch.empty( + (vllm_config.scheduler_config.max_num_batched_tokens,), + dtype=torch.int32, + device=device, + ) + + def build( + self, + common_prefix_len: int, + common_attn_metadata: CommonAttentionMetadata, + fast_build: bool = False, + ) -> ROCMAiterMLASparseMetadata: + num_tokens = common_attn_metadata.num_actual_tokens + starts = np.asarray(common_attn_metadata.query_start_loc_cpu, dtype=np.int32) + seg_lengths = np.diff(starts) + req_id_per_token = np.repeat( + np.arange(seg_lengths.shape[0], dtype=np.int32), seg_lengths + ) + # Zero-fill for cudagraphs + self.req_id_per_token_buffer.fill_(0) + self.req_id_per_token_buffer[: req_id_per_token.shape[0]].copy_( + torch.from_numpy(req_id_per_token), non_blocking=True + ) + req_id_per_token = self.req_id_per_token_buffer[:num_tokens] + + metadata = ROCMAiterMLASparseMetadata( + num_reqs=common_attn_metadata.num_reqs, + max_query_len=common_attn_metadata.max_query_len, + max_seq_len=common_attn_metadata.max_seq_len, + num_actual_tokens=common_attn_metadata.num_actual_tokens, + query_start_loc=common_attn_metadata.query_start_loc, + slot_mapping=common_attn_metadata.slot_mapping, + block_table=common_attn_metadata.block_table_tensor, + req_id_per_token=req_id_per_token, + block_size=self.kv_cache_spec.block_size, + topk_tokens=self.topk_tokens, + ) + return metadata + + +# Take from +# https://github.com/deepseek-ai/FlashMLA/blob/main/tests/test_flash_mla_prefill.py#L72 +def reference_mla_sparse_prefill( + q: torch.Tensor, kv: torch.Tensor, indices: torch.Tensor, sm_scale: float, d_v: int +) -> tuple[torch.Tensor, torch.Tensor]: + import math + + def log2sumexp2(a: torch.Tensor, dim: int) -> torch.Tensor: + return torch.logsumexp(a * math.log(2), dim=dim) * math.log2(math.e) + + skv = kv.shape[0] + sq = q.shape[0] + topk = indices.shape[-1] + dqk = q.shape[-1] + indices = indices[:, 0, :] # [s_q, topk] + invalid_indices_mask = (indices < 0) | (indices >= skv) + indices[invalid_indices_mask] = 0 + qs = q # [s_q, h_q, d_qk] + kvs = kv[:, 0, :][indices].view(sq, topk, dqk) # [s_q, topk, d_qk] + + attn_score = (qs @ kvs.transpose(1, 2)).float() # [s_q, h_q, topk] + attn_score.masked_fill_(invalid_indices_mask.unsqueeze(1), float("-inf")) + attn_score *= sm_scale * math.log2(math.e) + lse = 
log2sumexp2(attn_score, dim=-1) # [s_q, h_q] + attn_score = torch.exp2(attn_score - lse.unsqueeze(-1)) # [s_q, h_q, topk] + result = attn_score.to(q.dtype) @ kvs[:, :, :d_v] + return (result, lse) + + +class ROCMAiterMLASparseImpl(MLACommonBaseImpl[ROCMAiterMLASparseMetadata]): + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: list[float] | None, + sliding_window: int | None, + kv_cache_dtype: str, + logits_soft_cap: float | None, + attn_type: str, + kv_sharing_target_layer_name: str | None, + # MLA Specific Arguments + topk_indice_buffer: torch.Tensor | None = None, + indexer: Optional["Indexer"] = None, + **mla_args, + ) -> None: + super().__init__( + num_heads, + head_size, + scale, + num_kv_heads, + alibi_slopes, + sliding_window, + kv_cache_dtype, + logits_soft_cap, + attn_type, + kv_sharing_target_layer_name, + **mla_args, + ) + self.softmax_scale = scale + assert indexer is not None + self.topk_indices_buffer = indexer.topk_indices_buffer + self.is_fp8bmm_enabled = rocm_aiter_ops.is_fp8bmm_enabled() + + def _forward_bf16_kv( + self, + q: torch.Tensor, + kv_c_and_k_pe_cache: torch.Tensor, + topk_indices: torch.Tensor, + attn_metadata: ROCMAiterMLASparseMetadata, + ) -> torch.Tensor: + num_tokens = q.shape[0] + kv_c_and_k_pe_cache = kv_c_and_k_pe_cache.view( + -1, 1, kv_c_and_k_pe_cache.shape[-1] + ) + + topk_indices = topk_indices.view(num_tokens, 1, -1) + output = reference_mla_sparse_prefill( + q, kv_c_and_k_pe_cache, topk_indices, self.softmax_scale, 512 + )[0] + return output[:, : self.num_heads, :] + + def forward( + self, + layer: AttentionLayer, + q: torch.Tensor, + k_c_normed: torch.Tensor, # key in unified attn + k_pe: torch.Tensor, # value in unified attn + kv_cache: torch.Tensor, + attn_metadata: ROCMAiterMLASparseMetadata, + output: torch.Tensor | None = None, + output_scale: torch.Tensor | None = None, + output_block_scale: torch.Tensor | None = None, + ) -> torch.Tensor: + # NOTE(lucas): for the sparse FlashMLA kernels the kernels want to use + # MQA 576/512 approach for both prefill and decode + + assert output is not None, "Output tensor must be provided." + + if output_scale is not None or output_block_scale is not None: + raise NotImplementedError( + "fused output quantization is not yet supported for ROCMAiterMLASparse" + ) + + if attn_metadata is None: + # The zero fill is required when used with DP + EP + # to ensure all ranks within a DP group compute the + # same expert outputs. + return output.fill_(0) + + num_actual_toks = attn_metadata.num_actual_tokens + + # Inputs and outputs may be padded for CUDA graphs + + q = q[:num_actual_toks, ...] + k_c_normed = k_c_normed[:num_actual_toks, ...] + k_pe = k_pe[:num_actual_toks, ...] 
+ + q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) + # Convert from (B, N, P) to (N, B, P) + q_nope = q_nope.transpose(0, 1) + if self.is_fp8bmm_enabled: + # Multiply+Transpose (N, B, P)x(N, P, L)->(N, B, L)->(B, N, L) + ql_nope = rocm_aiter_ops.triton_fp8_bmm( + q_nope, self.W_K, self.W_K_scale, group_size=128, transpose_bm=True + ) + else: + # Multiply (N, B, P) x (N, P, L) -> (N, B, L) + ql_nope = torch.bmm(q_nope, self.W_UK_T) + # Convert from (N, B, L) to (B, N, L) + ql_nope = ql_nope.transpose(0, 1) + + topk_indices = self.topk_indices_buffer[:num_actual_toks] + + topk_indices_global = triton_convert_req_index_to_global_index( + attn_metadata.req_id_per_token, + attn_metadata.block_table, + topk_indices, + BLOCK_SIZE=attn_metadata.block_size, + NUM_TOPK_TOKENS=attn_metadata.topk_tokens, + ) + + q = torch.cat([ql_nope, q_pe], dim=-1) + + # write the latent and rope to kv cache + if kv_cache.numel() > 0: + ops.concat_and_cache_mla( + k_c_normed, + k_pe.squeeze(1), + kv_cache, + attn_metadata.slot_mapping.flatten(), + kv_cache_dtype=self.kv_cache_dtype, + scale=layer._k_scale, + ) + + attn_out = self._forward_bf16_kv( + q, kv_cache, topk_indices_global, attn_metadata + ) + + self._v_up_proj(attn_out, out=output[:num_actual_toks]) + return output diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 095407a8b959..9e99ea964ee0 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -316,7 +316,7 @@ def bind_kv_cache( # TODO - analyze where runner_kv_caches is used and the right # way to ensure it properly reflects multiple attention layers # in the same decoder block. - if current_platform.is_cuda() or current_platform.is_xpu(): + if current_platform.is_cuda_alike() or current_platform.is_xpu(): # We know that the GPU runner is not impacted by this # case. Some test code depends on runner_kv_caches, but # not in a way that's impacted by ignoring this. 
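For intuition, the base-2 arithmetic in `reference_mla_sparse_prefill` above is an ordinary softmax rewritten in base 2: scores are multiplied by `sm_scale * log2(e)`, the normalizer is computed with `log2sumexp2`, and probabilities are recovered with `exp2`. A minimal sketch of that identity follows; it is not part of the patch series, and the tensor shape and the 0.1 scale are made-up illustration values:

import math

import torch

# Hypothetical [num_heads, topk] attention scores and softmax scale.
scores = torch.randn(4, 8)
scale = 0.1

# Reference: plain softmax in base e.
ref = torch.softmax(scores * scale, dim=-1)

# Base-2 path, mirroring reference_mla_sparse_prefill.
s2 = scores * scale * math.log2(math.e)                               # exponent in base 2
lse2 = torch.logsumexp(s2 * math.log(2), dim=-1) * math.log2(math.e)  # log2(sum(2**s2))
out = torch.exp2(s2 - lse2.unsqueeze(-1))

assert torch.allclose(ref, out, atol=1e-5)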
From c0c2dd1e0b75c70706f4d8dbcd1d75f1c1750e14 Mon Sep 17 00:00:00 2001 From: Or Ozeri Date: Thu, 20 Nov 2025 12:55:10 +0200 Subject: [PATCH 240/578] [BugFix] kv_offloading: Fix bug in loading of partial cpu blocks (#28951) Signed-off-by: Or Ozeri Co-authored-by: Cyrus Leung --- tests/v1/kv_offload/test_cpu_gpu.py | 4 ++-- vllm/v1/kv_offload/worker/cpu_gpu.py | 20 +++++++++----------- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/tests/v1/kv_offload/test_cpu_gpu.py b/tests/v1/kv_offload/test_cpu_gpu.py index 0d4fa344d298..a248104e16d2 100644 --- a/tests/v1/kv_offload/test_cpu_gpu.py +++ b/tests/v1/kv_offload/test_cpu_gpu.py @@ -103,8 +103,8 @@ def test_transfer( for i in range(gpu_blocks_per_cpu_block): cpu_blocks_in_gpu_block_size.append(i + base_block_id) - # maybe skip a GPU block to test writing to the middle of a CPU block - if gpu_to_cpu: + # maybe skip a GPU block to test reading from the middle of a CPU block + if not gpu_to_cpu: gpu_blocks = gpu_blocks[gpu_blocks_per_cpu_block - 1 :] cpu_blocks_in_gpu_block_size = cpu_blocks_in_gpu_block_size[ gpu_blocks_per_cpu_block - 1 : diff --git a/vllm/v1/kv_offload/worker/cpu_gpu.py b/vllm/v1/kv_offload/worker/cpu_gpu.py index 0f2ec4a1b41f..111046377a5d 100644 --- a/vllm/v1/kv_offload/worker/cpu_gpu.py +++ b/vllm/v1/kv_offload/worker/cpu_gpu.py @@ -135,22 +135,20 @@ def transfer_async(self, job_id: int, spec: TransferSpec) -> bool: assert src_blocks.ndim == 1 assert dst_blocks.ndim == 1 - dst_sub_blocks_to_skip = -src_blocks.size % dst_block_size_factor src_sub_block_count = src_blocks.size * src_block_size_factor + dst_sub_block_count = dst_blocks.size * dst_block_size_factor + src_sub_blocks_to_skip = -dst_blocks.size % src_block_size_factor - assert ( - src_sub_block_count - == dst_blocks.size * dst_block_size_factor - dst_sub_blocks_to_skip - ) + assert dst_sub_block_count == src_sub_block_count - src_sub_blocks_to_skip - src_to_dst = np.empty((src_sub_block_count, 2), dtype=np.int64) - expand_block_ids(src_blocks, src_block_size_factor, src_to_dst[:, 0]) + src_to_dst = np.empty((dst_sub_block_count, 2), dtype=np.int64) expand_block_ids( - dst_blocks, - dst_block_size_factor, - src_to_dst[:, 1], - skip_count=dst_sub_blocks_to_skip, + src_blocks, + src_block_size_factor, + src_to_dst[:, 0], + skip_count=src_sub_blocks_to_skip, ) + expand_block_ids(dst_blocks, dst_block_size_factor, src_to_dst[:, 1]) src_to_dst_tensor = torch.from_numpy(src_to_dst) event = self.events_pool.pop() if self.events_pool else torch.Event() From c9e093116c00781dda86df7a77e976c614b35d51 Mon Sep 17 00:00:00 2001 From: Shinichi Hemmi <50256998+Alnusjaponica@users.noreply.github.com> Date: Thu, 20 Nov 2025 20:00:19 +0900 Subject: [PATCH 241/578] [MODEL] Implement plamo3 (#28834) Signed-off-by: Shinichi Hemmi <50256998+Alnusjaponica@users.noreply.github.com> --- docs/models/supported_models.md | 1 + tests/distributed/test_pipeline_parallel.py | 1 + tests/models/registry.py | 4 + vllm/model_executor/models/plamo3.py | 431 ++++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + 5 files changed, 438 insertions(+) create mode 100644 vllm/model_executor/models/plamo3.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 80fe143269a7..f0531ced0aaa 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -436,6 +436,7 @@ th { | `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. 
| ✅︎ | ✅︎ | | `PersimmonForCausalLM` | Persimmon | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. | | ✅︎ | | `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | | ✅︎ | +| `Plamo3ForCausalLM` | PLaMo3 | `pfnet/plamo-3-nict-2b-base`, `pfnet/plamo-3-nict-8b-base`, etc. | | ✅︎ | | `QWenLMHeadModel` | Qwen | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. | ✅︎ | ✅︎ | | `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. | ✅︎ | ✅︎ | | `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | ✅︎ | ✅︎ | diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 0ab94d30858f..89f035d2cdd6 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -130,6 +130,7 @@ def iter_params(self, model_id: str): "inceptionai/jais-13b-chat": PPTestSettings.fast(), "ai21labs/Jamba-tiny-dev": PPTestSettings.fast(), "pfnet/plamo-2-1b": PPTestSettings.fast(), + "pfnet/plamo-3-nict-2b-base": PPTestSettings.fast(), "meta-llama/Llama-3.2-1B-Instruct": PPTestSettings.detailed(), # Tests TransformersForCausalLM "hmellor/Ilama-3.2-1B": PPTestSettings.fast(), diff --git a/tests/models/registry.py b/tests/models/registry.py index 094f921e4305..1999e3cd2de2 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -402,6 +402,10 @@ def check_available_online( "pfnet/plamo-2-1b", trust_remote_code=True, ), + "Plamo3ForCausalLM": _HfExamplesInfo( + "pfnet/plamo-3-nict-2b-base", + trust_remote_code=True, + ), "QWenLMHeadModel": _HfExamplesInfo( "Qwen/Qwen-7B-Chat", max_transformers_version="4.53", diff --git a/vllm/model_executor/models/plamo3.py b/vllm/model_executor/models/plamo3.py new file mode 100644 index 000000000000..5bb07722a5fc --- /dev/null +++ b/vllm/model_executor/models/plamo3.py @@ -0,0 +1,431 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Inference-only PLaMo3 model.""" + +from collections.abc import Iterable +from itertools import islice +from typing import Any + +import torch +from torch import nn +from transformers import PretrainedConfig + +from vllm.attention.layer import Attention +from vllm.compilation.decorators import support_torch_compile +from vllm.config import VllmConfig +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.distributed.parallel_state import get_pp_group +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import ( + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear, +) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.vocab_parallel_embedding import ( + DEFAULT_VOCAB_PADDING_SIZE, + ParallelLMHead, + VocabParallelEmbedding, +) +from vllm.model_executor.model_loader.weight_utils import ( + LoaderFunction, + composed_weight_loader, + default_weight_loader, +) +from vllm.model_executor.models.interfaces import SupportsPP +from vllm.model_executor.models.utils import ( + AutoWeightsLoader, + extract_layer_index, + make_empty_intermediate_tensors_factory, + make_layers, + maybe_prefix, +) +from vllm.model_executor.utils import set_weight_attrs 
+from vllm.sequence import IntermediateTensors + + +# Only used for type hinting. +class Plamo3Config(PretrainedConfig): # type: ignore + model_type: str = "plamo3" + + hidden_size: int + num_hidden_layers: int + rms_norm_eps: float + # Attention + num_attention_heads: int + head_dim: int + num_key_value_heads: int + # vllm rename `sliding_window` attr to `interleaved_sliding_window` + # if `sliding_window` is list + interleaved_sliding_window: list[int | None] + sliding_window_pattern: int + rope_theta: int + rope_local_theta: int + # MLP + intermediate_size: int + # Tokenizer + vocab_size: int + + +def rms_norm_weight_loader(offset: float) -> LoaderFunction: + return composed_weight_loader( + default_weight_loader, + lambda x: x + offset, + ) + + +class DenseMLP(nn.Module): + def __init__( + self, + config: Plamo3Config, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_up_proj = MergedColumnParallelLinear( + self.hidden_size, + [self.intermediate_size] * 2, + bias=False, + prefix=f"{prefix}.gate_up_proj", + quant_config=quant_config, + return_bias=False, + ) + self.act = SiluAndMul() + self.down_proj = RowParallelLinear( + self.intermediate_size, + self.hidden_size, + bias=False, + prefix=f"{prefix}.down_proj", + quant_config=quant_config, + return_bias=False, + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + h = self.gate_up_proj(hidden_states) + h = self.act(h) + return self.down_proj(h) + + +class Plamo3AttentionMixer(nn.Module): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> None: + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + + self.hidden_size = config.hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = config.num_attention_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = config.num_key_value_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. 
+ assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = config.head_dim + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + + self.qkv_proj = QKVParallelLinear( + config.hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + config.hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + layer_idx = extract_layer_index(prefix) + full_attn = config.interleaved_sliding_window[layer_idx] is None + + self.rope_theta = config.rope_theta if full_attn else config.rope_local_theta + self.rope_scaling = ( + config.rope_scaling if hasattr(config, "rope_scaling") else None + ) + max_position = config.max_position_embeddings + if hasattr(vllm_config.model_config, "max_model_len") and isinstance( + vllm_config.model_config.max_model_len, int + ): + max_position = min(max_position, vllm_config.model_config.max_model_len) + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position, + base=self.rope_theta, + rope_scaling=self.rope_scaling, + ) + self.q_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps) + set_weight_attrs( + self.q_norm.weight, {"weight_loader": rms_norm_weight_loader(offset=1.0)} + ) + self.k_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps) + set_weight_attrs( + self.k_norm.weight, {"weight_loader": rms_norm_weight_loader(offset=1.0)} + ) + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=vllm_config.cache_config, + per_layer_sliding_window=config.interleaved_sliding_window[layer_idx], + prefix=f"{prefix}.attn", + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: torch.Tensor | None, + **kwargs: Any, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + + q_shape = q.shape + q = q.reshape(q_shape[:-1] + (q_shape[-1] // self.head_dim, self.head_dim)) + q = self.q_norm.forward_native(q).reshape(q_shape) + k_shape = k.shape + k = k.reshape(k_shape[:-1] + (k_shape[-1] // self.head_dim, self.head_dim)) + k = self.k_norm.forward_native(k).reshape(k_shape) + + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v) + output, _ = self.o_proj(attn_output) + return output + + +class Plamo3DecoderLayer(nn.Module): + def __init__( + self, vllm_config: VllmConfig, prefix: str = "", **kwargs: Any + ) -> None: + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + + self.mixer = Plamo3AttentionMixer( + vllm_config=vllm_config, + prefix=f"{prefix}.mixer", + ) + + self.mlp = DenseMLP( + config=config, quant_config=quant_config, prefix=f"{prefix}.mlp" + ) + self.pre_mixer_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + set_weight_attrs( + self.pre_mixer_norm.weight, + {"weight_loader": rms_norm_weight_loader(offset=1.0)}, + ) + self.post_mixer_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + set_weight_attrs( + self.post_mixer_norm.weight, + {"weight_loader": rms_norm_weight_loader(offset=1.0 / 5)}, + ) + self.pre_mlp_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + 
set_weight_attrs( + self.pre_mlp_norm.weight, + {"weight_loader": rms_norm_weight_loader(offset=1.0)}, + ) + self.post_mlp_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + set_weight_attrs( + self.post_mlp_norm.weight, + {"weight_loader": rms_norm_weight_loader(offset=1.0 / (5**1.5))}, + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: torch.Tensor | None, + **kwargs: Any, + ) -> tuple[torch.Tensor, torch.Tensor | None]: + if residual is None: + residual = hidden_states + hidden_states = self.pre_mixer_norm(hidden_states) + else: + hidden_states, residual = self.pre_mixer_norm(hidden_states, residual) + + hidden_states = self.mixer( + positions=positions, hidden_states=hidden_states, residual=residual + ) + hidden_states = self.post_mixer_norm(hidden_states) + # Fully Connected + hidden_states, residual = self.pre_mlp_norm(hidden_states, residual) + hidden_states = self.mlp(hidden_states) + hidden_states = self.post_mlp_norm(hidden_states) + return hidden_states, residual + + +class Plamo3Decoder(torch.nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: + super().__init__() + num_hidden_layers = vllm_config.model_config.hf_config.num_hidden_layers + + self.start_layer, self.end_layer, self.layers = make_layers( + num_hidden_layers, + lambda prefix: Plamo3DecoderLayer(vllm_config, prefix=prefix), + prefix=f"{prefix}.layers", + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: torch.Tensor | None, + ) -> tuple[torch.Tensor, torch.Tensor | None]: + for layer in islice(self.layers, self.start_layer, self.end_layer): + hidden_states, residual = layer( + positions=positions, + hidden_states=hidden_states, + residual=residual, + ) + return hidden_states, residual + + +@support_torch_compile +class Plamo3Model(nn.Module): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + + self.config = config + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.org_vocab_size = config.vocab_size + + self.embed_tokens = VocabParallelEmbedding( + self.vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + prefix=f"{prefix}.embed_tokens", + ) + self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size + ) + self.layers = Plamo3Decoder(vllm_config, prefix=f"{prefix}.layers") + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + set_weight_attrs( + self.norm.weight, + {"weight_loader": rms_norm_weight_loader(offset=1.0)}, + ) + + def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + ) -> torch.Tensor: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.embed_input_ids(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + hidden_states, residual = self.layers( + positions=positions, hidden_states=hidden_states, residual=residual + ) + if not get_pp_group().is_last_rank: + return IntermediateTensors( + 
{"hidden_states": hidden_states, "residual": residual} + ) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class Plamo3ForCausalLM(nn.Module, SupportsPP): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: + super().__init__() + self.config = vllm_config.model_config.hf_config + self.vllm_config = vllm_config + self.model_config = vllm_config.model_config + self.scheduler_config = vllm_config.scheduler_config + + self.model = Plamo3Model( + vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") + ) + + self.vocab_size = self.config.vocab_size + self.unpadded_vocab_size = self.config.vocab_size + + num_embeddings = ((self.vocab_size + 15) // 16) * 16 + self.lm_head = ParallelLMHead( + num_embeddings, + self.config.hidden_size, + org_num_embeddings=self.config.vocab_size, + padding_size=DEFAULT_VOCAB_PADDING_SIZE, + prefix=f"{prefix}.lm_head", + ) + if self.config.tie_word_embeddings: + self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens) + + self.logits_processor = LogitsProcessor( + self.unpadded_vocab_size, self.config.vocab_size + ) + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors + ) + + def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.embed_input_ids(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + ) -> torch.Tensor: + hidden_states = self.model( + input_ids, positions, intermediate_tensors, inputs_embeds + ) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + ) -> torch.Tensor | None: + logits = self.logits_processor(self.lm_head, hidden_states) + return logits + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + loader = AutoWeightsLoader( + self, + skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), + ) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index a2de597c87d8..494398760620 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -157,6 +157,7 @@ "Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"), "PhiMoEForCausalLM": ("phimoe", "PhiMoEForCausalLM"), "Plamo2ForCausalLM": ("plamo2", "Plamo2ForCausalLM"), + "Plamo3ForCausalLM": ("plamo3", "Plamo3ForCausalLM"), "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"), "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"), From 371b1d4c61335ed4c1d7fb2acee75274cc6d4551 Mon Sep 17 00:00:00 2001 From: Samit <285365963@qq.com> Date: Thu, 20 Nov 2025 19:01:03 +0800 Subject: [PATCH 242/578] [RL] Add Pause and Resume Generation for Asynchronous RL Training (#28037) Signed-off-by: SamitHuang <285365963@qq.com> Signed-off-by: Samit <285365963@qq.com> Signed-off-by: samithuang <285365963@qq.com> Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com> --- vllm/engine/protocol.py | 27 ++++++++++ vllm/entrypoints/openai/api_server.py | 78 +++++++++++++++++++++++++++ vllm/v1/engine/async_llm.py | 64 ++++++++++++++++++++++ vllm/v1/engine/output_processor.py | 13 +++++ 4 files changed, 182 insertions(+) diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 462d2c4e50e7..5e3374f9f6a1 
100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -149,6 +149,33 @@ async def add_lora(self, lora_request: LoRARequest) -> bool: """Load a new LoRA adapter into the engine for future requests.""" ... + @abstractmethod + async def pause_generation( + self, + *, + wait_for_inflight_requests: bool = False, + clear_cache: bool = True, + ) -> None: + """Pause new generation/encoding requests. + + Args: + wait_for_inflight_requests: When ``True`` waits for in-flight requests + to finish before pausing. When ``False`` (default), aborts in-flight + requests immediately. + clear_cache: Whether to clear KV and prefix caches after draining. + """ + ... + + @abstractmethod + async def resume_generation(self) -> None: + """Resume accepting generation/encoding requests.""" + ... + + @abstractmethod + async def is_paused(self) -> bool: + """Return whether the engine is currently paused.""" + ... + async def scale_elastic_ep( self, new_data_parallel_size: int, drain_timeout: int = 300 ) -> None: diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 3974f45a7135..70174250ceab 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -394,6 +394,84 @@ async def get_server_load_metrics(request: Request): return JSONResponse(content={"server_load": request.app.state.server_load_metrics}) +@router.post("/pause") +async def pause_generation( + raw_request: Request, + wait_for_inflight_requests: bool = Query(False), + clear_cache: bool = Query(True), +) -> JSONResponse: + """Pause generation requests to allow weight updates. + + Args: + wait_for_inflight_requests: When ``True`` waits for in-flight + requests to finish before pausing. When ``False`` (default), + aborts any in-flight requests immediately. + clear_cache: Whether to clear KV/prefix caches after draining. 
+ """ + + engine = engine_client(raw_request) + + try: + await engine.pause_generation( + wait_for_inflight_requests=wait_for_inflight_requests, + clear_cache=clear_cache, + ) + return JSONResponse( + content={"status": "paused"}, + status_code=HTTPStatus.OK.value, + ) + + except ValueError as err: + return JSONResponse( + content={"error": str(err)}, + status_code=HTTPStatus.BAD_REQUEST.value, + ) + except Exception as err: # pragma: no cover - defensive + logger.exception("Failed to pause generation") + return JSONResponse( + content={"error": f"Failed to pause generation: {err}"}, + status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + ) + + +@router.post("/resume") +async def resume_generation(raw_request: Request) -> JSONResponse: + """Resume generation after a pause.""" + + engine = engine_client(raw_request) + + try: + await engine.resume_generation() + return JSONResponse( + content={"status": "resumed"}, + status_code=HTTPStatus.OK.value, + ) + except Exception as err: # pragma: no cover - defensive + logger.exception("Failed to resume generation") + return JSONResponse( + content={"error": f"Failed to resume generation: {err}"}, + status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + ) + + +@router.get("/is_paused") +async def is_paused(raw_request: Request) -> JSONResponse: + """Return the current pause status.""" + + engine = engine_client(raw_request) + + try: + paused = await engine.is_paused() + except Exception as err: # pragma: no cover - defensive + logger.exception("Failed to fetch pause status") + return JSONResponse( + content={"error": f"Failed to fetch pause status: {err}"}, + status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + ) + + return JSONResponse(content={"is_paused": paused}) + + @router.post( "/tokenize", dependencies=[Depends(validate_json_request)], diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index abf2c8cfa453..c64b3cccfc65 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -152,6 +152,10 @@ def __init__( ) self.logger_manager.log_engine_initialized() + # Pause / resume state for async RL workflows. + self._pause_cond = asyncio.Condition() + self._paused = False + self.output_handler: asyncio.Task | None = None try: # Start output handler eagerly if we are in the asyncio eventloop. @@ -404,6 +408,10 @@ async def generate( # to handle startup failure gracefully in the OpenAI server. self._run_output_handler() + # Wait until generation is resumed if the engine is paused. + async with self._pause_cond: + await self._pause_cond.wait_for(lambda: not self._paused) + if tokenization_kwargs is None: tokenization_kwargs = {} truncate_prompt_tokens = sampling_params.truncate_prompt_tokens @@ -551,6 +559,58 @@ async def abort(self, request_id: str | Iterable[str]) -> None: if self.log_requests: logger.info("Aborted request(s) %s.", ",".join(request_ids)) + async def pause_generation( + self, + *, + wait_for_inflight_requests: bool = False, + clear_cache: bool = True, + ) -> None: + """ + Pause generation to allow model weight updates. + + New generation/encoding requests are blocked until resume. + + Args: + wait_for_inflight_requests: When ``True`` waits for in-flight + requests to finish before pausing. When ``False`` (default), + immediately aborts any in-flight requests. + clear_cache: Whether to clear KV cache and prefix cache after + draining. Set to ``False`` to preserve cache for faster resume. + Default is ``True`` (clear caches). 
+ """ + + async with self._pause_cond: + if self._paused: + return + self._paused = True + + if not wait_for_inflight_requests: + request_ids = list(self.output_processor.request_states.keys()) + if request_ids: + await self.abort(request_ids) + + # Wait for running requests to drain before clearing cache. + if self.output_processor.has_unfinished_requests(): + await self.output_processor.wait_for_requests_to_drain() + + # Clear cache + if clear_cache: + await self.reset_prefix_cache() + await self.reset_mm_cache() + + async def resume_generation(self) -> None: + """Resume generation after :meth:`pause_generation`.""" + + async with self._pause_cond: + self._paused = False + self._pause_cond.notify_all() # Wake up all waiting requests + + async def is_paused(self) -> bool: + """Return whether the engine is currently paused.""" + + async with self._pause_cond: + return self._paused + async def encode( self, prompt: PromptType, @@ -582,6 +642,10 @@ async def encode( # to handle startup failure gracefully in the OpenAI server. self._run_output_handler() + # Respect pause state before accepting new requests. + async with self._pause_cond: + await self._pause_cond.wait_for(lambda: not self._paused) + if tokenization_kwargs is None: tokenization_kwargs = {} _validate_truncation_size( diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index bdbbfe2595f8..0453c4a77f0c 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -350,6 +350,8 @@ def __init__( self.parent_requests: dict[str, ParentRequest] = {} self.lora_states = LoRARequestStates(log_stats) self.tracer: Tracer | None = None + self._requests_drained = asyncio.Event() + self._requests_drained.set() def get_num_unfinished_requests(self): return len(self.request_states) @@ -357,6 +359,11 @@ def get_num_unfinished_requests(self): def has_unfinished_requests(self) -> bool: return len(self.request_states) > 0 + async def wait_for_requests_to_drain(self) -> None: + if not self.request_states: + return + await self._requests_drained.wait() + def propagate_error(self, e: Exception): """Propagate error to all generate() tasks.""" @@ -396,6 +403,8 @@ def abort_requests( child_reqs = self.abort_requests(child_reqs) request_ids_to_abort.extend(child_reqs) self.parent_requests.pop(request_id, None) + if not self.request_states: + self._requests_drained.set() return request_ids_to_abort def add_request( @@ -420,6 +429,8 @@ def add_request( log_stats=self.log_stats, stream_interval=self.stream_interval, ) + if self._requests_drained.is_set(): + self._requests_drained.clear() self.request_states[request_id] = req_state if parent_req: self.parent_requests[parent_req.request_id] = parent_req @@ -511,6 +522,8 @@ def process_outputs( parent_req = req_state.parent_req if parent_req and not parent_req.child_requests: self.parent_requests.pop(parent_req.request_id, None) + if not self.request_states: + self._requests_drained.set() if not engine_core_output.finished: # If req not finished in EngineCore, but Detokenizer # detected stop string, abort needed in EngineCore. 
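Once a server built with this patch is running, the new pause/resume endpoints can be exercised directly over HTTP. The snippet below is an illustrative sketch rather than part of the patch; it assumes the OpenAI-compatible server is reachable at the default http://localhost:8000, and the actual weight update is left as a commented placeholder:

import requests

base = "http://localhost:8000"  # assumed server address

# Pause: drain in-flight requests instead of aborting them, and keep KV/prefix caches warm.
resp = requests.post(
    f"{base}/pause",
    params={"wait_for_inflight_requests": "true", "clear_cache": "false"},
)
assert resp.json() == {"status": "paused"}

assert requests.get(f"{base}/is_paused").json()["is_paused"] is True

# ... push updated model weights to the paused engine here ...

# Resume: blocked generate()/encode() calls are woken via the asyncio.Condition.
assert requests.post(f"{base}/resume").json() == {"status": "resumed"}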
From 93c8672ceb06f6e9c282a96fcd85a7ce41293693 Mon Sep 17 00:00:00 2001 From: Zhewen Li Date: Thu, 20 Nov 2025 03:05:50 -0800 Subject: [PATCH 243/578] [Bugfix] Fix spec decode memory regression after #28549 (#28819) Signed-off-by: zhewenli --- vllm/model_executor/models/deepseek_eagle.py | 5 ----- vllm/model_executor/models/llama4_eagle.py | 7 ------- vllm/model_executor/models/llama_eagle.py | 5 ----- vllm/v1/spec_decode/eagle.py | 7 +++++-- 4 files changed, 5 insertions(+), 19 deletions(-) diff --git a/vllm/model_executor/models/deepseek_eagle.py b/vllm/model_executor/models/deepseek_eagle.py index 3fb04c3b70dd..4d7a37292cb0 100644 --- a/vllm/model_executor/models/deepseek_eagle.py +++ b/vllm/model_executor/models/deepseek_eagle.py @@ -8,7 +8,6 @@ from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig -from vllm.distributed.parallel_state import get_pp_group from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -172,10 +171,6 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: ) break else: - # if PP disabled then draft will share embed with target - if get_pp_group().world_size == 1 and "embed_tokens." in name: - continue - # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue diff --git a/vllm/model_executor/models/llama4_eagle.py b/vllm/model_executor/models/llama4_eagle.py index 660c8f1bb522..0146b3057928 100644 --- a/vllm/model_executor/models/llama4_eagle.py +++ b/vllm/model_executor/models/llama4_eagle.py @@ -23,7 +23,6 @@ from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig -from vllm.distributed.parallel_state import get_pp_group from vllm.logger import init_logger from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -127,17 +126,11 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: weight_loader(param, loaded_weight, shard_id) break else: - # if PP disabled then draft will share embed with target - if get_pp_group().world_size == 1 and "embed_tokens." in name: - continue param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) loaded_params.add(name) for name in params_dict: - # if PP disabled then draft will share embed with target - if get_pp_group().world_size == 1 and "embed_tokens." in name: - continue assert name in loaded_params, f"{name} is not loaded!" 
return loaded_params diff --git a/vllm/model_executor/models/llama_eagle.py b/vllm/model_executor/models/llama_eagle.py index 90ab5c50361b..05cb456e7776 100644 --- a/vllm/model_executor/models/llama_eagle.py +++ b/vllm/model_executor/models/llama_eagle.py @@ -9,7 +9,6 @@ from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig -from vllm.distributed.parallel_state import get_pp_group from vllm.logger import init_logger from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -155,10 +154,6 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: weight_loader(param, loaded_weight, shard_id) break else: - # if PP disabled then draft will share embed with target - if get_pp_group().world_size == 1 and "embed_tokens." in name: - continue - param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 406bb696bd4c..ba37bc81607f 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -1028,8 +1028,11 @@ def load_model(self, target_model: nn.Module) -> None: elif ( isinstance(target_embed_tokens.weight, torch.Tensor) and isinstance(self.model.model.embed_tokens.weight, torch.Tensor) - and torch.equal( - target_embed_tokens.weight, self.model.model.embed_tokens.weight + and torch.allclose( + target_embed_tokens.weight.cpu(), + self.model.model.embed_tokens.weight.cpu(), + rtol=1e-5, + atol=1e-7, ) ): share_embeddings = True From a2e9ebe9e242295a58e400835ef98a14b29c4fb0 Mon Sep 17 00:00:00 2001 From: Fanli Lin Date: Thu, 20 Nov 2025 20:14:29 +0800 Subject: [PATCH 244/578] [BugFix] Fix flash_attn import in `siglip2navit.py` (#29082) Signed-off-by: Fanli Lin --- vllm/model_executor/models/siglip2navit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/siglip2navit.py b/vllm/model_executor/models/siglip2navit.py index 46f5e67d659e..c185b45345bd 100644 --- a/vllm/model_executor/models/siglip2navit.py +++ b/vllm/model_executor/models/siglip2navit.py @@ -191,7 +191,7 @@ def apply_rotary_pos_emb( cos = cos.chunk(2, dim=-1)[0].contiguous() sin = sin.chunk(2, dim=-1)[0].contiguous() if is_flash_attn_backend and not current_platform.is_xpu(): - from flash_attn.layers.rotary import apply_rotary_emb + from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb apply_rotary_emb_func = apply_rotary_emb else: From 82b05b15e61badfd0c5912d4c3eebc88043c9ef8 Mon Sep 17 00:00:00 2001 From: TJian Date: Thu, 20 Nov 2025 23:34:11 +0700 Subject: [PATCH 245/578] [BugFix] [FEAT] Enable fastsafetensors for ROCm platform (#28225) Signed-off-by: tjtanaa --- requirements/rocm.txt | 1 + .../fastsafetensors_loader/test_fastsafetensors_loader.py | 3 ++- .../model_loader/fastsafetensors_loader/test_weight_utils.py | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/requirements/rocm.txt b/requirements/rocm.txt index 6f1cca90e5e2..abbd33d6e124 100644 --- a/requirements/rocm.txt +++ b/requirements/rocm.txt @@ -15,3 +15,4 @@ setuptools-scm>=8 runai-model-streamer[s3,gcs]==0.15.0 conch-triton-kernels==1.2.1 timm>=1.0.17 +fastsafetensors @ git+https://github.com/foundation-model-stack/fastsafetensors.git@d6f998a03432b2452f8de2bb5cefb5af9795d459 diff --git a/tests/model_executor/model_loader/fastsafetensors_loader/test_fastsafetensors_loader.py 
b/tests/model_executor/model_loader/fastsafetensors_loader/test_fastsafetensors_loader.py index f154df6dfc23..c5b3c731ffc6 100644 --- a/tests/model_executor/model_loader/fastsafetensors_loader/test_fastsafetensors_loader.py +++ b/tests/model_executor/model_loader/fastsafetensors_loader/test_fastsafetensors_loader.py @@ -19,7 +19,8 @@ @pytest.mark.skipif( - not current_platform.is_cuda(), reason="fastsafetensors requires CUDA/NVIDIA GPUs" + not current_platform.is_cuda_alike(), + reason="fastsafetensors requires NVIDIA/AMD GPUs", ) def test_model_loader_download_files(vllm_runner): with vllm_runner(test_model, load_format="fastsafetensors") as llm: diff --git a/tests/model_executor/model_loader/fastsafetensors_loader/test_weight_utils.py b/tests/model_executor/model_loader/fastsafetensors_loader/test_weight_utils.py index bd216f0e41a4..1975eb61b25d 100644 --- a/tests/model_executor/model_loader/fastsafetensors_loader/test_weight_utils.py +++ b/tests/model_executor/model_loader/fastsafetensors_loader/test_weight_utils.py @@ -17,7 +17,8 @@ @pytest.mark.skipif( - not current_platform.is_cuda(), reason="fastsafetensors requires CUDA/NVIDIA GPUs" + not current_platform.is_cuda_alike(), + reason="fastsafetensors requires NVIDIA/AMD GPUs", ) def test_fastsafetensors_model_loader(): with tempfile.TemporaryDirectory() as tmpdir: From 56f45eddaff817ec7118bf9a73c5e4b560738bed Mon Sep 17 00:00:00 2001 From: rookie <66160395+zhanggzh@users.noreply.github.com> Date: Fri, 21 Nov 2025 01:02:30 +0800 Subject: [PATCH 246/578] [Frontend] Optimize beam search loop by sorting and then splicing (#19347) Signed-off-by: zhangguozhu Signed-off-by: mgoin Co-authored-by: zhangguozhu Co-authored-by: mgoin --- vllm/entrypoints/openai/serving_engine.py | 103 +++++++++++++++------- 1 file changed, 70 insertions(+), 33 deletions(-) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index c50b0c4a23e1..127b8e6dcb87 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -10,6 +10,7 @@ from http import HTTPStatus from typing import Any, ClassVar, Generic, TypeAlias, TypeVar +import numpy as np import torch from fastapi import Request from pydantic import BaseModel, ConfigDict, Field, TypeAdapter @@ -389,8 +390,9 @@ async def beam_search( sort_beams_key = create_sort_beams_key_function(eos_token_id, length_penalty) + logprobs_num = 2 * beam_width beam_search_params = SamplingParams( - logprobs=2 * beam_width, + logprobs=logprobs_num, max_tokens=1, temperature=temperature, ) @@ -443,40 +445,75 @@ async def beam_search( output = [x[0] for x in await asyncio.gather(*tasks)] new_beams = [] - for i, current_beam in enumerate(all_beams): - result = output[i] - + # Store all new tokens generated by beam + all_beams_token_id = [] + # Store the cumulative probability of all tokens + # generated by beam search + all_beams_logprob = [] + # Iterate through all beam inference results + for i, result in enumerate(output): + current_beam = all_beams[i] if result.outputs[0].logprobs is not None: logprobs = result.outputs[0].logprobs[0] - for token_id, logprob_obj in logprobs.items(): - if token_id == eos_token_id and not ignore_eos: - completed.append( - BeamSearchSequence( - tokens=current_beam.tokens + [token_id] - if include_stop_str_in_output - else current_beam.tokens, - logprobs=current_beam.logprobs + [logprobs], - cum_logprob=current_beam.cum_logprob - + logprob_obj.logprob, - finish_reason="stop", - stop_reason=eos_token_id, - ) - ) - 
else: - new_beams.append( - BeamSearchSequence( - tokens=current_beam.tokens + [token_id], - logprobs=current_beam.logprobs + [logprobs], - lora_request=current_beam.lora_request, - cum_logprob=current_beam.cum_logprob - + logprob_obj.logprob, - multi_modal_data=current_beam.multi_modal_data, - mm_processor_kwargs=current_beam.mm_processor_kwargs, - ) - ) - - sorted_beams = sorted(new_beams, key=sort_beams_key, reverse=True) - all_beams = sorted_beams[:beam_width] + all_beams_token_id.extend(list(logprobs.keys())) + all_beams_logprob.extend( + [ + current_beam.cum_logprob + obj.logprob + for obj in logprobs.values() + ] + ) + + # Handle the token for the end of sentence (EOS) + all_beams_token_id = np.array(all_beams_token_id) + all_beams_logprob = np.array(all_beams_logprob) + + if not ignore_eos: + # Get the index position of eos token in all generated results + eos_idx = np.where(all_beams_token_id == eos_token_id)[0] + for idx in eos_idx: + current_beam = all_beams[idx // logprobs_num] + result = output[idx // logprobs_num] + assert result.outputs[0].logprobs is not None + logprobs_entry = result.outputs[0].logprobs[0] + completed.append( + BeamSearchSequence( + tokens=current_beam.tokens + [eos_token_id] + if include_stop_str_in_output + else current_beam.tokens, + logprobs=current_beam.logprobs + [logprobs_entry], + cum_logprob=float(all_beams_logprob[idx]), + finish_reason="stop", + stop_reason=eos_token_id, + ) + ) + # After processing, set the log probability of the eos condition + # to negative infinity. + all_beams_logprob[eos_idx] = -np.inf + + # Processing non-EOS tokens + # Get indices of the top beam_width probabilities + topn_idx = np.argpartition(np.negative(all_beams_logprob), beam_width)[ + :beam_width + ] + + for idx in topn_idx: + current_beam = all_beams[idx // logprobs_num] + result = output[idx // logprobs_num] + token_id = int(all_beams_token_id[idx]) + assert result.outputs[0].logprobs is not None + logprobs_entry = result.outputs[0].logprobs[0] + new_beams.append( + BeamSearchSequence( + tokens=current_beam.tokens + [token_id], + logprobs=current_beam.logprobs + [logprobs_entry], + lora_request=current_beam.lora_request, + cum_logprob=float(all_beams_logprob[idx]), + multi_modal_data=current_beam.multi_modal_data, + mm_processor_kwargs=current_beam.mm_processor_kwargs, + ) + ) + + all_beams = new_beams completed.extend(all_beams) sorted_completed = sorted(completed, key=sort_beams_key, reverse=True) From 22924383e14a7a37ee86cf6e15f39e13efc86f7c Mon Sep 17 00:00:00 2001 From: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com> Date: Thu, 20 Nov 2025 11:07:06 -0600 Subject: [PATCH 247/578] Updating the mirror of test-amd.yaml as of 2025-11-18 (#29016) Signed-off-by: Alexei V. 
Ivanov --- .buildkite/test-amd.yaml | 41 ++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 37c6bd427672..4e2ff5c5a6bd 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -61,7 +61,7 @@ steps: - pytest -v -s -m 'not cpu_test' multimodal - pytest -v -s utils_ -- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins +- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 4 mins timeout_in_minutes: 10 mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 @@ -73,6 +73,7 @@ steps: - tests/multimodal - tests/standalone_tests/lazy_imports.py - tests/transformers_utils + - tests/config no_gpu: true commands: - python3 standalone_tests/lazy_imports.py @@ -80,6 +81,7 @@ steps: - pytest -v -s test_outputs.py - pytest -v -s -m 'cpu_test' multimodal - pytest -v -s transformers_utils + - pytest -v -s config - label: Python-only Installation Test # 10min timeout_in_minutes: 20 @@ -390,6 +392,15 @@ steps: commands: - pytest -v -s v1/attention +- label: V1 Test attention (B200) # 10min + timeout_in_minutes: 30 + gpu: b200 + source_file_dependencies: + - vllm/v1/attention + - tests/v1/attention + commands: + - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this + - label: V1 Test others (CPU) # 5 mins mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 @@ -529,7 +540,7 @@ steps: - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' # Limit to no custom ops to reduce running time # Wrap with quotes to escape yaml and avoid starting -k string with a - - - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and -quant_fp8'" + - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" - label: Cudagraph test timeout_in_minutes: 20 @@ -694,7 +705,7 @@ steps: - vllm/model_executor/models/whisper.py commands: # LMEval # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442 - - pytest -s entrypoints/openai/correctness/ --ignore entrypoints/openai/correctness/test_transcription_api_correctness.py + - pytest -s entrypoints/openai/correctness/ - label: OpenAI-Compatible Tool Use # 23 min timeout_in_minutes: 35 @@ -995,12 +1006,12 @@ steps: optional: true commands: - pip install --upgrade git+https://github.com/huggingface/transformers - - pytest -v -s tests/models/test_initialization.py + - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)' - pytest -v -s tests/models/test_transformers.py - - pytest -v -s tests/models/multimodal/processing/ - - pytest -v -s tests/models/multimodal/test_mapping.py + # - pytest -v -s tests/models/multimodal/processing/ + - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)' - python3 examples/offline_inference/basic/chat.py - - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl + # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl # 
Whisper needs spawn method to avoid deadlock - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper @@ -1045,7 +1056,7 @@ steps: - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - pytest -v -s tests/kernels/moe/test_flashinfer.py -- label: Blackwell Fusion Tests # 30 min +- label: Blackwell Fusion and Compile Tests # 30 min timeout_in_minutes: 40 working_dir: "/vllm-workspace/" gpu: b200 @@ -1066,7 +1077,9 @@ steps: - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time # Wrap with quotes to escape yaml - - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'" + - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" + # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) + - pytest -v -s tests/compile/distributed/test_full_graph.py::test_fp8_kv_scale_compile - label: Blackwell Fusion E2E Tests # 30 min timeout_in_minutes: 40 @@ -1088,15 +1101,13 @@ steps: commands: - nvidia-smi # Run all e2e fusion tests - - pytest -v -s tests/compile/distributed/test_fusions_e2e.py - # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) - - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile + - pytest -v -s tests/compile/test_fusions_e2e.py - label: ROCm GPT-OSS Eval timeout_in_minutes: 60 working_dir: "/vllm-workspace/" agent_pool: mi325_1 - mirror_hardwares: [amdproduction] + mirror_hardwares: [amdexperimental, amdproduction] optional: true # run on nightlies source_file_dependencies: - tests/evals/gpt_oss @@ -1416,7 +1427,9 @@ steps: - pytest -v -s tests/compile/distributed/test_async_tp.py - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - - pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm + #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm + - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" + - pytest -v -s tests/compile/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 - pytest -v -s tests/v1/distributed/test_dbo.py From e5bfcb6a88cda4f91e3c7074d7e76bb5d1d36362 Mon Sep 17 00:00:00 2001 From: Pan Li <1162953505@qq.com> Date: Fri, 21 Nov 2025 01:38:31 +0800 Subject: [PATCH 248/578] [BugFix][PD]: make example proxy usable with P2pNcclConnector (#26628) Signed-off-by: PAN <1162953505@qq.com> --- .../disagg_prefill_proxy_server.py | 249 +++++++++++------- .../online_serving/disaggregated_prefill.sh | 19 +- 2 files changed, 169 insertions(+), 99 deletions(-) diff --git a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py index 904f80534914..d072c03c440b 100644 --- a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py +++ 
b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py @@ -5,11 +5,12 @@ import asyncio import logging import os +import time +import uuid +from urllib.parse import urlparse import aiohttp from quart import Quart, Response, make_response, request -from rate_limiter import RateLimiter -from request_queue import RequestQueue # Configure logging logging.basicConfig(level=logging.INFO) @@ -24,26 +25,8 @@ def parse_args(): parser.add_argument( "--timeout", type=float, - default=300, - help="Timeout for backend service requests in seconds (default: 300)", - ) - parser.add_argument( - "--max-concurrent", - type=int, - default=100, - help="Maximum concurrent requests to backend services (default: 100)", - ) - parser.add_argument( - "--queue-size", - type=int, - default=500, - help="Maximum number of requests in the queue (default: 500)", - ) - parser.add_argument( - "--rate-limit", - type=int, - default=40, - help="Maximum requests per second (default: 40)", + default=6 * 60 * 60, + help="Timeout for backend service requests in seconds (default: 21600)", ) parser.add_argument( "--port", @@ -54,14 +37,32 @@ def parse_args(): parser.add_argument( "--prefill-url", type=str, - default="http://localhost:8100/v1/completions", - help="Prefill service endpoint URL", + default="http://localhost:8100", + help="Prefill service base URL (protocol + host[:port])", ) parser.add_argument( "--decode-url", type=str, - default="http://localhost:8200/v1/completions", - help="Decode service endpoint URL", + default="http://localhost:8200", + help="Decode service base URL (protocol + host[:port])", + ) + parser.add_argument( + "--kv-host", + type=str, + default="localhost", + help="Hostname or IP used by KV transfer (default: localhost)", + ) + parser.add_argument( + "--prefill-kv-port", + type=int, + default=14579, + help="Prefill KV port (default: 14579)", + ) + parser.add_argument( + "--decode-kv-port", + type=int, + default=14580, + help="Decode KV port (default: 14580)", ) return parser.parse_args() @@ -73,70 +74,129 @@ def main(): # Initialize configuration using command line parameters AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=args.timeout) - MAX_CONCURRENT_REQUESTS = args.max_concurrent - REQUEST_QUEUE_SIZE = args.queue_size - RATE_LIMIT = args.rate_limit PREFILL_SERVICE_URL = args.prefill_url DECODE_SERVICE_URL = args.decode_url PORT = args.port - app = Quart(__name__) + PREFILL_KV_ADDR = f"{args.kv_host}:{args.prefill_kv_port}" + DECODE_KV_ADDR = f"{args.kv_host}:{args.decode_kv_port}" - # Initialize the rate limiter and request queue - rate_limiter = RateLimiter(RATE_LIMIT) - request_queue = RequestQueue(MAX_CONCURRENT_REQUESTS, REQUEST_QUEUE_SIZE) + logger.info( + "Proxy resolved KV addresses -> prefill: %s, decode: %s", + PREFILL_KV_ADDR, + DECODE_KV_ADDR, + ) + + app = Quart(__name__) - # Attach the configuration object to the application instance + # Attach the configuration object to the application instance so helper + # coroutines can read the resolved backend URLs and timeouts without using + # globals. 
app.config.update( { "AIOHTTP_TIMEOUT": AIOHTTP_TIMEOUT, - "rate_limiter": rate_limiter, - "request_queue": request_queue, "PREFILL_SERVICE_URL": PREFILL_SERVICE_URL, "DECODE_SERVICE_URL": DECODE_SERVICE_URL, + "PREFILL_KV_ADDR": PREFILL_KV_ADDR, + "DECODE_KV_ADDR": DECODE_KV_ADDR, } ) - # Start queue processing on app startup - @app.before_serving - async def startup(): - """Start request processing task when app starts serving""" - asyncio.create_task(request_queue.process()) - - async def forward_request(url, data): - """Forward request to backend service with rate limiting and error handling""" - headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} - - # Use rate limiter as context manager - async with ( - rate_limiter, - aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session, - ): - try: - async with session.post( - url=url, json=data, headers=headers - ) as response: - if response.status == 200: - # Stream response chunks - async for chunk_bytes in response.content.iter_chunked(1024): - yield chunk_bytes - else: - # Handle backend service errors - error_text = await response.text() - logger.error( - "Backend service error: %s - %s", - response.status, - error_text, - ) - yield b'{"error": "Backend service error"}' - except aiohttp.ClientError as e: - # Handle connection errors - logger.error("Connection error to %s: %s", url, str(e)) - yield b'{"error": "Service unavailable"}' - except asyncio.TimeoutError: - # Handle timeout errors - logger.error("Timeout connecting to %s", url) - yield b'{"error": "Service timeout"}' + def _normalize_base_url(url: str) -> str: + """Remove any trailing slash so path joins behave predictably.""" + return url.rstrip("/") + + def _get_host_port(url: str) -> str: + """Return the hostname:port portion for logging and KV headers.""" + parsed = urlparse(url) + host = parsed.hostname or "localhost" + port = parsed.port + if port is None: + port = 80 if parsed.scheme == "http" else 443 + return f"{host}:{port}" + + PREFILL_BASE = _normalize_base_url(PREFILL_SERVICE_URL) + DECODE_BASE = _normalize_base_url(DECODE_SERVICE_URL) + KV_TARGET = _get_host_port(DECODE_SERVICE_URL) + + def _build_headers(request_id: str) -> dict[str, str]: + """Construct the headers expected by vLLM's P2P disagg connector.""" + headers: dict[str, str] = {"X-Request-Id": request_id, "X-KV-Target": KV_TARGET} + api_key = os.environ.get("OPENAI_API_KEY") + if api_key: + headers["Authorization"] = f"Bearer {api_key}" + return headers + + async def _run_prefill( + request_path: str, + payload: dict, + headers: dict[str, str], + request_id: str, + ): + url = f"{PREFILL_BASE}{request_path}" + start_ts = time.perf_counter() + logger.info("[prefill] start request_id=%s url=%s", request_id, url) + try: + async with ( + aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session, + session.post(url=url, json=payload, headers=headers) as resp, + ): + if resp.status != 200: + error_text = await resp.text() + raise RuntimeError( + f"Prefill backend error {resp.status}: {error_text}" + ) + await resp.read() + logger.info( + "[prefill] done request_id=%s status=%s elapsed=%.2fs", + request_id, + resp.status, + time.perf_counter() - start_ts, + ) + except asyncio.TimeoutError as exc: + raise RuntimeError(f"Prefill service timeout at {url}") from exc + except aiohttp.ClientError as exc: + raise RuntimeError(f"Prefill service unavailable at {url}") from exc + + async def _stream_decode( + request_path: str, + payload: dict, + headers: dict[str, str], + request_id: str, + ): + url 
= f"{DECODE_BASE}{request_path}" + # Stream tokens from the decode service once the prefill stage has + # materialized KV caches on the target workers. + logger.info("[decode] start request_id=%s url=%s", request_id, url) + try: + async with ( + aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session, + session.post(url=url, json=payload, headers=headers) as resp, + ): + if resp.status != 200: + error_text = await resp.text() + logger.error( + "Decode backend error %s - %s", resp.status, error_text + ) + err_msg = ( + '{"error": "Decode backend error ' + str(resp.status) + '"}' + ) + yield err_msg.encode() + return + logger.info( + "[decode] streaming response request_id=%s status=%s", + request_id, + resp.status, + ) + async for chunk_bytes in resp.content.iter_chunked(1024): + yield chunk_bytes + logger.info("[decode] finished streaming request_id=%s", request_id) + except asyncio.TimeoutError: + logger.error("Decode service timeout at %s", url) + yield b'{"error": "Decode service timeout"}' + except aiohttp.ClientError as exc: + logger.error("Decode service error at %s: %s", url, exc) + yield b'{"error": "Decode service unavailable"}' async def process_request(): """Process a single request through prefill and decode stages""" @@ -146,13 +206,27 @@ async def process_request(): # Create prefill request (max_tokens=1) prefill_request = original_request_data.copy() prefill_request["max_tokens"] = 1 + if "max_completion_tokens" in prefill_request: + prefill_request["max_completion_tokens"] = 1 # Execute prefill stage - async for _ in forward_request(PREFILL_SERVICE_URL, prefill_request): - continue + # The request id encodes both KV socket addresses so the backend can + # shuttle tensors directly via NCCL once the prefill response + # completes. + request_id = ( + f"___prefill_addr_{PREFILL_KV_ADDR}___decode_addr_" + f"{DECODE_KV_ADDR}_{uuid.uuid4().hex}" + ) + + headers = _build_headers(request_id) + await _run_prefill(request.path, prefill_request, headers, request_id) # Execute decode stage and stream response - generator = forward_request(DECODE_SERVICE_URL, original_request_data) + # Pass the unmodified user request so the decode phase can continue + # sampling with the already-populated KV cache. 
+ generator = _stream_decode( + request.path, original_request_data, headers, request_id + ) response = await make_response(generator) response.timeout = None # Disable timeout for streaming response return response @@ -168,23 +242,10 @@ async def process_request(): @app.route("/v1/completions", methods=["POST"]) async def handle_request(): """Handle incoming API requests with concurrency and rate limiting""" - # Create task for request processing - task = asyncio.create_task(process_request()) - - # Enqueue request or reject if queue is full - if not await request_queue.enqueue(task): - return Response( - response=b'{"error": "Server busy, try again later"}', - status=503, - content_type="application/json", - ) - try: - # Return the response from the processing task - return await task + return await process_request() except asyncio.CancelledError: - # Handle task cancellation (timeout or queue full) - logger.warning("Request cancelled due to timeout or queue full") + logger.warning("Request cancelled") return Response( response=b'{"error": "Request cancelled"}', status=503, diff --git a/examples/online_serving/disaggregated_prefill.sh b/examples/online_serving/disaggregated_prefill.sh index d434e22b1ae8..cd2f2e44a4d6 100644 --- a/examples/online_serving/disaggregated_prefill.sh +++ b/examples/online_serving/disaggregated_prefill.sh @@ -24,7 +24,14 @@ cleanup() { exit 0 } -export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') + +if [[ -z "${VLLM_HOST_IP:-}" ]]; then + export VLLM_HOST_IP=127.0.0.1 + echo "Using default VLLM_HOST_IP=127.0.0.1 (override by exporting VLLM_HOST_IP before running this script)" +else + echo "Using provided VLLM_HOST_IP=${VLLM_HOST_IP}" +fi + # install quart first -- required for disagg prefill proxy serve if python3 -c "import quart" &> /dev/null; then @@ -38,7 +45,7 @@ fi wait_for_server() { local port=$1 timeout 1200 bash -c " - until curl -s localhost:${port}/v1/completions > /dev/null; do + until curl -i localhost:${port}/v1/models > /dev/null; do sleep 1 done" && return 0 || return 1 } @@ -48,21 +55,23 @@ wait_for_server() { # prefilling instance, which is the KV producer CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL_NAME \ + --host 0.0.0.0 \ --port 8100 \ --max-model-len 100 \ --gpu-memory-utilization 0.8 \ --trust-remote-code \ --kv-transfer-config \ - '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' & + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":"1e9","kv_port":"14579","kv_connector_extra_config":{"proxy_ip":"'"$VLLM_HOST_IP"'","proxy_port":"30001","http_ip":"'"$VLLM_HOST_IP"'","http_port":"8100","send_type":"PUT_ASYNC"}}' & -# decoding instance, which is the KV consumer +# decoding instance, which is the KV consumer CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \ + --host 0.0.0.0 \ --port 8200 \ --max-model-len 100 \ --gpu-memory-utilization 0.8 \ --trust-remote-code \ --kv-transfer-config \ - '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' & + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":"1e10","kv_port":"14580","kv_connector_extra_config":{"proxy_ip":"'"$VLLM_HOST_IP"'","proxy_port":"30001","http_ip":"'"$VLLM_HOST_IP"'","http_port":"8200","send_type":"PUT_ASYNC"}}' & # wait until prefill and decode instances are ready wait_for_server 8100 From 647464719b131963dccdc3a28cfe52d1af293cda Mon Sep 17 00:00:00 2001 From: Or Ozeri Date: Thu, 
20 Nov 2025 20:09:59 +0200 Subject: [PATCH 249/578] [KVConnector][Core] Support cross-layer KV blocks (#27743) Signed-off-by: Or Ozeri --- .../unit/test_offloading_connector.py | 8 +- tests/v1/kv_offload/test_cpu_offloading.py | 145 +++++++++------ tests/v1/worker/test_gpu_model_runner.py | 5 +- vllm/attention/backends/abstract.py | 29 ++- .../kv_transfer/kv_connector/v1/base.py | 33 +++- .../kv_connector/v1/offloading_connector.py | 43 ++++- vllm/v1/attention/backends/flash_attn.py | 12 +- vllm/v1/attention/backends/flashinfer.py | 12 +- vllm/v1/attention/backends/mla/common.py | 9 + vllm/v1/attention/backends/mla/indexer.py | 6 +- vllm/v1/kv_offload/cpu.py | 17 +- vllm/v1/kv_offload/spec.py | 6 +- vllm/v1/kv_offload/worker/cpu_gpu.py | 12 +- vllm/v1/worker/gpu_model_runner.py | 41 ++++- .../worker/kv_connector_model_runner_mixin.py | 165 ++++++++++++++++++ 15 files changed, 453 insertions(+), 90 deletions(-) diff --git a/tests/v1/kv_connector/unit/test_offloading_connector.py b/tests/v1/kv_connector/unit/test_offloading_connector.py index 23b6c4802d10..69565f584ab8 100644 --- a/tests/v1/kv_connector/unit/test_offloading_connector.py +++ b/tests/v1/kv_connector/unit/test_offloading_connector.py @@ -19,6 +19,7 @@ ) from vllm.forward_context import ForwardContext from vllm.utils.hashing import sha256 +from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend from vllm.v1.core.kv_cache_utils import ( BlockHash, get_request_block_hasher, @@ -92,7 +93,7 @@ def get_manager(self) -> OffloadingManager: return self.manager def get_handlers( - self, _ + self, _, __ ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], OffloadingHandler]]: yield GPULoadStoreSpec, MockLoadStoreSpec, self.handler yield MockLoadStoreSpec, GPULoadStoreSpec, self.handler @@ -138,7 +139,10 @@ def __init__( self.worker_connector = OffloadingConnector(vllm_config, KVConnectorRole.WORKER) # register worker kv_caches to enable OffloadingWorker creations - self.worker_connector.register_kv_caches(kv_caches={"a": torch.empty(0)}) + self.worker_connector.register_cross_layers_kv_cache( + kv_cache=torch.empty(0), + attn_backend=FlashAttentionBackend, + ) # extract connector of scheduler scheduler_connector = self.scheduler.connector diff --git a/tests/v1/kv_offload/test_cpu_offloading.py b/tests/v1/kv_offload/test_cpu_offloading.py index b654ea4298db..3ee41c40859d 100644 --- a/tests/v1/kv_offload/test_cpu_offloading.py +++ b/tests/v1/kv_offload/test_cpu_offloading.py @@ -12,8 +12,10 @@ from vllm import LLM, SamplingParams, TokensPrompt from vllm.config import KVEventsConfig, KVTransferConfig from vllm.distributed.kv_events import BlockStored, KVEventBatch +from vllm.utils.system_utils import set_env_var -CPU_BLOCK_SIZES = [16, 48] +CPU_BLOCK_SIZES = [48] +ATTN_BACKENDS = ["FLASH_ATTN", "FLASHINFER"] class MockSubscriber: @@ -63,8 +65,88 @@ def close(self): self.sub.close() +def _latency_test(llm: LLM, subscriber: MockSubscriber): + sampling_params = SamplingParams(max_tokens=1) + + num_times_cpu_better_than_cold = 0 + num_tests = 10 + total_cold_time = 0.0 + total_gpu_hit_time = 0.0 + total_cpu_hit_time = 0.0 + prompt_token_ids = [0] * 10001 + for i in tqdm(range(num_tests), desc="Running tests"): + prompt_token_ids[0] = i + prompts = [TokensPrompt(prompt_token_ids=prompt_token_ids)] + + # run generation - this should trigger saving KV cache + start_time = time.time() + llm.generate(prompts, sampling_params, use_tqdm=False) + cold_time = time.time() - start_time + total_cold_time += cold_time + + # run 
generation again - should hit the GPU prefix cache + start_time = time.time() + llm.generate(prompts, sampling_params, use_tqdm=False) + gpu_hit_time = time.time() - start_time + total_gpu_hit_time += gpu_hit_time + + # reset prefix cache to avoid GPU hit. + llm.reset_prefix_cache() + + assert subscriber.get_new_cpu_stored_events() + + # run generation again - this should trigger loading from CPU + start_time = time.time() + llm.generate(prompts, sampling_params, use_tqdm=False) + cpu_hit_time = time.time() - start_time + total_cpu_hit_time += cpu_hit_time + + if cpu_hit_time < cold_time: + num_times_cpu_better_than_cold += 1 + + print("Average times:") + print(f" Cold: {total_cold_time * 1000 / num_tests:.2f}ms") + print(f" GPU hit: {total_gpu_hit_time * 1000 / num_tests:.2f}ms") + print(f" CPU hit: {total_cpu_hit_time * 1000 / num_tests:.2f}ms") + + assert num_times_cpu_better_than_cold >= 0.8 * num_tests + + +def _accuracy_test(llm: LLM, subscriber: MockSubscriber): + sampling_params = SamplingParams(max_tokens=1) + cpu_block_size = ( + llm.llm_engine.vllm_config.kv_transfer_config.kv_connector_extra_config[ + "block_size" + ] + ) + + subscriber.get_new_cpu_stored_events() + + # prepend prompt to be cpu block aligned + prompt = "Let's count to 10. One, two, three, four," + while ( + len(llm.generate(prompt, use_tqdm=False)[0].prompt_token_ids) % cpu_block_size + != 0 + ): + prompt = ". " + prompt + + assert subscriber.get_new_cpu_stored_events() + + test_count = 100 + success_count = 0 + for i in range(test_count): + if ( + llm.generate(prompt, sampling_params, use_tqdm=False)[0].outputs[0].text + == " five" + ): + success_count += 1 + + assert success_count >= 0.5 * test_count + + @pytest.mark.parametrize("cpu_block_size", CPU_BLOCK_SIZES) -def test_cpu_offloading(cpu_block_size: int) -> None: +@pytest.mark.parametrize("attn_backend", ATTN_BACKENDS) +def test_cpu_offloading(cpu_block_size: int, attn_backend: str) -> None: """ Tests OffloadingConnector with CPUOffloadingSpec. """ @@ -92,61 +174,20 @@ def test_cpu_offloading(cpu_block_size: int) -> None: topic="test", ) - llm = LLM( - model="meta-llama/Llama-3.2-1B-Instruct", - gpu_memory_utilization=0.5, - kv_events_config=kv_events_config, - kv_transfer_config=kv_transfer_config, - ) - - sampling_params = SamplingParams(temperature=0, max_tokens=1) + with set_env_var("VLLM_ATTENTION_BACKEND", attn_backend): + llm = LLM( + model="meta-llama/Llama-3.2-1B-Instruct", + gpu_memory_utilization=0.5, + kv_events_config=kv_events_config, + kv_transfer_config=kv_transfer_config, + ) events_endpoint = events_endpoint.replace("*", "127.0.0.1") subscriber = MockSubscriber(events_endpoint, topic=kv_events_config.topic) try: - num_times_cpu_better_than_cold = 0 - num_tests = 10 - total_cold_time = 0.0 - total_gpu_hit_time = 0.0 - total_cpu_hit_time = 0.0 - prompt_token_ids = [0] * 10001 - for i in tqdm(range(num_tests), desc="Running tests"): - prompt_token_ids[0] = i - prompts = [TokensPrompt(prompt_token_ids=prompt_token_ids)] - - # run generation - this should trigger saving KV cache - start_time = time.time() - llm.generate(prompts, sampling_params, use_tqdm=False) - cold_time = time.time() - start_time - total_cold_time += cold_time - - # run generation again - should hit the GPU prefix cache - start_time = time.time() - llm.generate(prompts, sampling_params, use_tqdm=False) - gpu_hit_time = time.time() - start_time - total_gpu_hit_time += gpu_hit_time - - # reset prefix cache to avoid GPU hit. 
- llm.reset_prefix_cache() - - assert subscriber.get_new_cpu_stored_events() - - # run generation again - this should trigger loading from CPU - start_time = time.time() - llm.generate(prompts, sampling_params, use_tqdm=False) - cpu_hit_time = time.time() - start_time - total_cpu_hit_time += cpu_hit_time - - if cpu_hit_time < cold_time: - num_times_cpu_better_than_cold += 1 - - print("Average times:") - print(f" Cold: {total_cold_time * 1000 / num_tests:.2f}ms") - print(f" GPU hit: {total_gpu_hit_time * 1000 / num_tests:.2f}ms") - print(f" CPU hit: {total_cpu_hit_time * 1000 / num_tests:.2f}ms") - - assert num_times_cpu_better_than_cold >= 0.8 * num_tests + _latency_test(llm, subscriber) + _accuracy_test(llm, subscriber) finally: subscriber.close() del llm diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 824e45897835..01c1364f7ee6 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -483,7 +483,10 @@ def test_kv_cache_stride_order(monkeypatch, model_runner): # Permutation that gets you back to expected kv shape for test_stride in ((1, 4, 0, 2, 3), (0, 1, 2, 3, 4)): - def rnd_stride_order(test_stride=test_stride): + def rnd_stride_order( + include_num_layers_dimension: bool = False, test_stride=test_stride + ): + assert not include_num_layers_dimension return test_stride # Patch the attention backend class and re-trigger the KV cache creation diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 188becb6ad6f..67ded8847524 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -76,7 +76,34 @@ def get_kv_cache_shape( raise NotImplementedError @staticmethod - def get_kv_cache_stride_order() -> tuple[int, ...]: + def get_kv_cache_stride_order( + include_num_layers_dimension: bool = False, + ) -> tuple[int, ...]: + """ + Get the physical (memory layout) ordering of the kv cache dimensions. + e.g. if the KV cache shape is + [2, num_blocks, block_size, num_heads, head_size], + and get_kv_cache_stride_order returns (1, 3, 0, 2, 4) then the physical + ordering of dimensions is + [num_blocks, num_heads, 2, block_size, head_size]. + + If this function is unimplemented / raises NotImplementedError, + the physical layout of the KV cache will match the logical shape. + + Args: + include_num_layers_dimension: if True, includes an additional + num_layers dimension, which is assumed to be prepended + to the logical KV cache shape. + With the above example, a return value (2, 4, 0, 1, 3, 5) + corresponds to + [num_blocks, num_heads, num_layers, 2, block_size, head_size]. + + If an additional dimension is NOT included in the returned + tuple, the physical layout will not include a layers dimension. + + Returns: + A tuple of ints which is a permutation of range(len(shape)). 
+ """ raise NotImplementedError @classmethod diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index f85eb414b222..74f09278b7bb 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -38,7 +38,7 @@ import enum from abc import ABC, abstractmethod from collections.abc import Callable, Iterable -from typing import TYPE_CHECKING, Any, Literal, Optional +from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional import torch @@ -47,7 +47,7 @@ from vllm.v1.outputs import KVConnectorOutput if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionMetadata + from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata from vllm.config import VllmConfig from vllm.distributed.kv_events import KVCacheEvent from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( @@ -142,6 +142,18 @@ class KVConnectorMetadata(ABC): # noqa: B024 class KVConnectorBase_V1(ABC): + """ + Base class for KV connectors. + + Attributes: + prefer_cross_layer_blocks (bool): Indicates whether this connector + prefers KV blocks that hold KV data for all layers (for speeding + up KV data transfers). + Defaults to False. + """ + + prefer_cross_layer_blocks: ClassVar[bool] = False + def __init__( self, vllm_config: "VllmConfig", @@ -226,6 +238,23 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): """ return + def register_cross_layers_kv_cache( + self, kv_cache: torch.Tensor, attn_backend: type["AttentionBackend"] + ): + """ + Initialize with a single KV cache tensor used by all layers. + The first dimension should be num_layers. + This function will only be called for models with uniform layers, + and only if the prefers_cross_layer_blocks is set to True. + Only one of the functions + {register_kv_caches, register_cross_layers_kv_cache} will be called. + + Args: + kv_cache: a cross-layers kv cache tensor + attn_backend: The attention backend that corresponds to all layers + """ + return + def set_host_xfer_buffer_ops(self, copy_operation: CopyBlocksOp): """ Set the xPU-specific ops for copying KV between host and device. 
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py index 582e42cc466a..8cd09014cab1 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py @@ -4,12 +4,12 @@ from collections.abc import Iterable, Iterator from dataclasses import dataclass from itertools import islice -from typing import Any +from typing import Any, ClassVar import torch -from vllm.attention import AttentionMetadata -from vllm.config import VllmConfig +from vllm.attention import Attention, AttentionBackend, AttentionMetadata +from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.distributed.kv_events import BlockRemoved, BlockStored, KVCacheEvent from vllm.distributed.kv_transfer.kv_connector.v1 import ( KVConnectorBase_V1, @@ -42,6 +42,8 @@ class OffloadingConnectorMetadata(KVConnectorMetadata): class OffloadingConnector(KVConnectorBase_V1): + prefer_cross_layer_blocks: ClassVar[bool] = True + def __init__( self, vllm_config: VllmConfig, @@ -63,6 +65,12 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): assert self.connector_worker is not None self.connector_worker.register_kv_caches(kv_caches) + def register_cross_layers_kv_cache( + self, kv_cache: torch.Tensor, attn_backend: type[AttentionBackend] + ): + assert self.connector_worker is not None + self.connector_worker.register_cross_layers_kv_cache(kv_cache, attn_backend) + def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None: assert self.connector_worker is not None assert isinstance(self._connector_metadata, OffloadingConnectorMetadata) @@ -422,10 +430,35 @@ def _generate_job_id(self) -> int: self._job_counter = job_id + 1 return job_id - def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): - for src_cls, dst_cls, handler in self.spec.get_handlers(kv_caches): + def _register_handlers( + self, + kv_caches: dict[str, torch.Tensor], + attn_backends: dict[str, type[AttentionBackend]], + ): + for src_cls, dst_cls, handler in self.spec.get_handlers( + kv_caches, attn_backends + ): self.worker.register_handler(src_cls, dst_cls, handler) + def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): + layer_names = list(kv_caches.keys()) + layers = get_layers_from_vllm_config( + self.spec.vllm_config, Attention, layer_names + ) + attn_backends = { + layer_name: layers[layer_name].get_attn_backend() + for layer_name in layer_names + } + self._register_handlers(kv_caches, attn_backends) + + def register_cross_layers_kv_cache( + self, kv_cache: torch.Tensor, attn_backend: type[AttentionBackend] + ): + cross_layer_name = "ALL_LAYERS" + kv_caches = {cross_layer_name: kv_cache} + attn_backends = {cross_layer_name: attn_backend} + self._register_handlers(kv_caches, attn_backends) + def start_load_kv(self, metadata: OffloadingConnectorMetadata): for req_id, transfer_spec in metadata.reqs_to_load.items(): job_id = self._generate_job_id() diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index cf3c1d05f5b3..9fa6b1dfd19d 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -99,12 +99,20 @@ def get_kv_cache_shape( return (2, num_blocks, block_size, num_kv_heads, head_size) @staticmethod - def get_kv_cache_stride_order() -> tuple[int, ...]: + def get_kv_cache_stride_order( + include_num_layers_dimension: bool = 
False, + ) -> tuple[int, ...]: # `stride_order` indicates the permutation that gets # us from `get_kv_cache_shape` to the actual memory layout we want. cache_layout = get_kv_cache_layout() - if cache_layout == "NHD": + if cache_layout == "NHD" and include_num_layers_dimension: + # (num_blocks, num_layers, 2, block_size, num_kv_heads, head_size) + return (2, 0, 1, 3, 4, 5) + elif cache_layout == "NHD": stride_order = (0, 1, 2, 3, 4) + elif cache_layout == "HND" and include_num_layers_dimension: + # (num_blocks, num_kv_heads, num_layers, 2, block_size, head_size) + return (2, 4, 0, 1, 3, 5) elif cache_layout == "HND": stride_order = (0, 1, 3, 2, 4) else: diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 4da1637d96eb..3ad7e8c52fc1 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -309,12 +309,20 @@ def get_kv_cache_shape( return (num_blocks, 2, block_size, num_kv_heads, head_size) @staticmethod - def get_kv_cache_stride_order() -> tuple[int, ...]: + def get_kv_cache_stride_order( + include_num_layers_dimension: bool = False, + ) -> tuple[int, ...]: # `stride_order` indicates the permutation that gets us from # `get_kv_cache_shape` to the actual memory layout we want. cache_layout = get_kv_cache_layout() - if cache_layout == "NHD": + if cache_layout == "NHD" and include_num_layers_dimension: + # (num_blocks, num_layers, 2, block_size, num_kv_heads, head_size) + return (1, 0, 2, 3, 4, 5) + elif cache_layout == "NHD": stride_order = (0, 1, 2, 3, 4) + elif cache_layout == "HND" and include_num_layers_dimension: + # (num_blocks, 2, num_kv_heads, num_layers, block_size, head_size) + return (1, 2, 4, 0, 3, 5) elif cache_layout == "HND": stride_order = (0, 1, 3, 2, 4) else: diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 32f406980f2e..43aef8a7cca9 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -308,6 +308,15 @@ def get_kv_cache_shape( ) -> tuple[int, ...]: return (num_blocks, block_size, head_size) + @staticmethod + def get_kv_cache_stride_order( + include_num_layers_dimension: bool = False, + ) -> tuple[int, ...]: + # `stride_order` indicates the permutation that gets + # us from `get_kv_cache_shape` to the actual memory layout we want. 
+ # (num_blocks, num_layers, block_size, head_size) + return (1, 0, 2, 3) if include_num_layers_dimension else (0, 1, 2) + @classmethod def get_supported_head_sizes(cls) -> list[int]: return [576] diff --git a/vllm/v1/attention/backends/mla/indexer.py b/vllm/v1/attention/backends/mla/indexer.py index cc0988435768..d38361e0fcbf 100644 --- a/vllm/v1/attention/backends/mla/indexer.py +++ b/vllm/v1/attention/backends/mla/indexer.py @@ -48,7 +48,11 @@ def get_kv_cache_shape( return (num_blocks, block_size, head_size) @staticmethod - def get_kv_cache_stride_order() -> tuple[int, ...]: + def get_kv_cache_stride_order( + include_num_layers_dimension: bool = False, + ) -> tuple[int, ...]: + if include_num_layers_dimension: + return (0, 1, 2, 3) return (0, 1, 2) diff --git a/vllm/v1/kv_offload/cpu.py b/vllm/v1/kv_offload/cpu.py index 4b1bbe6f0cc2..86747299eb10 100644 --- a/vllm/v1/kv_offload/cpu.py +++ b/vllm/v1/kv_offload/cpu.py @@ -4,8 +4,8 @@ import torch -from vllm.config import VllmConfig, get_layers_from_vllm_config -from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase +from vllm.attention import AttentionBackend +from vllm.config import VllmConfig from vllm.platforms import current_platform from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager from vllm.v1.kv_offload.arc_manager import ARCOffloadingManager @@ -63,7 +63,9 @@ def get_manager(self) -> OffloadingManager: return self._manager def get_handlers( - self, kv_caches: dict[str, torch.Tensor] + self, + kv_caches: dict[str, torch.Tensor], + attn_backends: dict[str, type[AttentionBackend]], ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], OffloadingHandler]]: if not self._handler: if not current_platform.is_cuda_alike(): @@ -71,15 +73,6 @@ def get_handlers( "CPU Offloading is currently only supported on CUDA-alike GPUs" ) - layer_names = list(kv_caches.keys()) - layers = get_layers_from_vllm_config( - self.vllm_config, AttentionLayerBase, layer_names - ) - attn_backends = { - layer_name: layers[layer_name].get_attn_backend() - for layer_name in layer_names - } - self._handler = CpuGpuOffloadingHandler( attn_backends=attn_backends, gpu_block_size=self.gpu_block_size, diff --git a/vllm/v1/kv_offload/spec.py b/vllm/v1/kv_offload/spec.py index a3c539a47d45..c1813a4ff4ea 100644 --- a/vllm/v1/kv_offload/spec.py +++ b/vllm/v1/kv_offload/spec.py @@ -11,6 +11,7 @@ from vllm.v1.kv_offload.worker.worker import OffloadingHandler if TYPE_CHECKING: + from vllm.attention import AttentionBackend from vllm.config import VllmConfig logger = init_logger(__name__) @@ -48,13 +49,16 @@ def get_manager(self) -> OffloadingManager: @abstractmethod def get_handlers( - self, kv_caches: dict[str, torch.Tensor] + self, + kv_caches: dict[str, torch.Tensor], + attn_backends: dict[str, type["AttentionBackend"]], ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], OffloadingHandler]]: """ Get offloading handlers along with their respective src and dst types. Args: kv_caches: A dictionary of layer_name -> gpu_kv_cache tensor. + attn_backends: A dictionary of layer_name -> AttentionBackend. Yields: Tuples of (src_type, dst_type, offloading_handler). 
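To make the `include_num_layers_dimension` contract concrete, here is a small self-contained sketch (not part of the patch; the dimension sizes are arbitrary). It uses the FlashAttention NHD ordering `(2, 0, 1, 3, 4, 5)` added earlier in this series and shows how the physical cross-layer buffer and the per-layer logical views relate through the permutation and its inverse, which is the same round trip `allocate_uniform_kv_caches()` performs later in this patch.

```python
import torch

# Arbitrary sizes for illustration only.
num_layers, num_blocks, block_size, num_kv_heads, head_size = 4, 8, 16, 2, 32

# Logical shape: FlashAttention's (2, num_blocks, block_size, num_kv_heads,
# head_size) with a num_layers dimension prepended.
logical_shape = (num_layers, 2, num_blocks, block_size, num_kv_heads, head_size)

# NHD stride order with the layers dimension included, i.e. physical layout
# (num_blocks, num_layers, 2, block_size, num_kv_heads, head_size).
stride_order = (2, 0, 1, 3, 4, 5)

# Allocate the buffer directly in its physical layout.
physical_shape = tuple(logical_shape[i] for i in stride_order)
cross_layer_cache = torch.zeros(physical_shape)

# The inverse permutation restores the logical view without copying.
inv_order = [stride_order.index(i) for i in range(len(stride_order))]
logical_view = cross_layer_cache.permute(*inv_order)
assert tuple(logical_view.shape) == logical_shape

# Each layer still sees the per-layer shape it expects...
per_layer = {f"layer_{i}": logical_view[i] for i in range(num_layers)}
assert per_layer["layer_0"].shape == (2, num_blocks, block_size, num_kv_heads, head_size)

# ...while in memory, all layers' KV data for a given block forms one
# contiguous slab, which is what enables per-block cross-layer transfers.
assert cross_layer_cache[0].is_contiguous()
```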
diff --git a/vllm/v1/kv_offload/worker/cpu_gpu.py b/vllm/v1/kv_offload/worker/cpu_gpu.py index 111046377a5d..bb163f0043fc 100644 --- a/vllm/v1/kv_offload/worker/cpu_gpu.py +++ b/vllm/v1/kv_offload/worker/cpu_gpu.py @@ -83,10 +83,18 @@ def __init__( self.gpu_tensors.append(gpu_tensor) gpu_shape = gpu_tensor.shape - test_shape = attn_backends[layer_name].get_kv_cache_shape( + attn_backend = attn_backends[layer_name] + test_shape = attn_backend.get_kv_cache_shape( num_blocks=1234, block_size=16, num_kv_heads=8, head_size=256 ) - if test_shape[0] == 1234: + + if len(gpu_shape) != len(test_shape): + # cross-layers tensor + # shape is (num_blocks, ...) + assert len(gpu_shape) == len(test_shape) + 1 + num_blocks_idx = 0 + self.kv_dim_before_num_blocks.append(False) + elif test_shape[0] == 1234: # shape is (num_blocks, ...) num_blocks_idx = 0 self.kv_dim_before_num_blocks.append(False) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0490ed39c8c7..4b0a08ab57e1 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -349,6 +349,9 @@ def __init__( # self.model: nn.Module # Set after load_model # Initialize in initialize_kv_cache self.kv_caches: list[torch.Tensor] = [] + # Initialize in initialize_kv_cache_tensors + self.cross_layers_kv_cache: torch.Tensor | None = None + self.cross_layers_attn_backend: type[AttentionBackend] | None = None # indexes: [kv_cache_group_id][attn_group] self.attn_groups: list[list[AttentionGroup]] = [] # self.kv_cache_config: KVCacheConfig @@ -4930,12 +4933,30 @@ def initialize_kv_cache_tensors( Dict[str, torch.Tensor]: A map between layer names to their corresponding memory buffer for KV cache. """ - # Initialize the memory buffer for KV cache - kv_cache_raw_tensors = self._allocate_kv_cache_tensors(kv_cache_config) - # Change the memory buffer to the desired shape - kv_caches = self._reshape_kv_cache_tensors( - kv_cache_config, kv_cache_raw_tensors, kernel_block_sizes - ) + + # Try creating KV caches optimized for kv-connector transfers + cache_dtype = self.cache_config.cache_dtype + if self.use_uniform_kv_cache(self.attn_groups, cache_dtype): + kv_caches, cross_layers_kv_cache, attn_backend = ( + self.allocate_uniform_kv_caches( + kv_cache_config, + self.attn_groups, + cache_dtype, + self.device, + kernel_block_sizes, + ) + ) + self.cross_layers_kv_cache = cross_layers_kv_cache + self.cross_layers_attn_backend = attn_backend + else: + # Fallback to the general case + # Initialize the memory buffer for KV cache + kv_cache_raw_tensors = self._allocate_kv_cache_tensors(kv_cache_config) + + # Change the memory buffer to the desired shape + kv_caches = self._reshape_kv_cache_tensors( + kv_cache_config, kv_cache_raw_tensors, kernel_block_sizes + ) # Set up cross-layer KV cache sharing for layer_name, target_layer_name in self.shared_kv_cache_layers.items(): @@ -5017,7 +5038,13 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: if has_kv_transfer_group(): kv_transfer_group = get_kv_transfer_group() - kv_transfer_group.register_kv_caches(kv_caches) + if self.cross_layers_kv_cache is not None: + assert self.cross_layers_attn_backend is not None + kv_transfer_group.register_cross_layers_kv_cache( + self.cross_layers_kv_cache, self.cross_layers_attn_backend + ) + else: + kv_transfer_group.register_kv_caches(kv_caches) kv_transfer_group.set_host_xfer_buffer_ops(copy_kv_blocks) if self.dcp_world_size > 1: diff --git a/vllm/v1/worker/kv_connector_model_runner_mixin.py 
b/vllm/v1/worker/kv_connector_model_runner_mixin.py index db037a9fccd5..e59361f21372 100644 --- a/vllm/v1/worker/kv_connector_model_runner_mixin.py +++ b/vllm/v1/worker/kv_connector_model_runner_mixin.py @@ -11,7 +11,11 @@ TYPE_CHECKING, # noqa: UP035 ) +import torch + +from vllm.attention import AttentionBackend from vllm.config import VllmConfig +from vllm.config.cache import CacheDType from vllm.distributed.kv_transfer import ( ensure_kv_transfer_shutdown, get_kv_transfer_group, @@ -21,11 +25,13 @@ from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats from vllm.forward_context import get_forward_context, set_forward_context from vllm.logger import init_logger +from vllm.v1.kv_cache_interface import AttentionSpec, KVCacheConfig from vllm.v1.outputs import ( EMPTY_MODEL_RUNNER_OUTPUT, KVConnectorOutput, ModelRunnerOutput, ) +from vllm.v1.worker.utils import AttentionGroup if TYPE_CHECKING: from vllm.v1.core.sched.output import SchedulerOutput @@ -142,3 +148,162 @@ def get_kv_connector_stats() -> KVConnectorStats | None: if has_kv_transfer_group(): return get_kv_transfer_group().get_kv_connector_stats() return None + + @staticmethod + def use_uniform_kv_cache( + attn_groups: list[list[AttentionGroup]], + cache_dtype: CacheDType, + ) -> bool: + """ + Determines whether a uniform KV layout should be used. + A uniform layout means all layers KV caches will share the same + underlying tensor, where for a given block number, the respective + KV data for all layers will be contiguous. + This will allow efficient KV transfer of per-block KV data for all + layers at once. + Note this layout will only be applied given 3 conditions: + 1. The KV Cache config contains just a single group where all layers + have the same page size. + 2. A KV connector is configured, and the KV connector instance prefers + to use this layout (prefer_cross_layer_blocks() returns True) + 2. The flash attention backend supports this layout + (get_kv_cache_stride_order(True) includes a placement for a + num_layers dimension) + + Note that the actual placement of the num_layers dimensions + in the unified layers tensors will be determined by the attention + backend. + Thus, the layers KV data may still not be contiguous per block + if the attention backend does not support it. + + Args: + attn_groups: The list of attention groups for this model + cache_dtype: The KV cache dtype + Returns: + True if we should use a uniform KV cache layout. 
+ """ + + if not has_kv_transfer_group(): + return False + if not get_kv_transfer_group().prefer_cross_layer_blocks: + return False + + if len(attn_groups) != 1 or len(attn_groups[0]) != 1: + return False + + attn_group = attn_groups[0][0] + kv_cache_spec = attn_group.kv_cache_spec + if not isinstance(kv_cache_spec, AttentionSpec): + return False + + attn_backend = attn_group.backend + kv_cache_shape = attn_backend.get_kv_cache_shape( + 1234, + kv_cache_spec.block_size, + kv_cache_spec.num_kv_heads, + kv_cache_spec.head_size, + cache_dtype_str=cache_dtype, + ) + + try: + kv_cache_stride_order = attn_backend.get_kv_cache_stride_order( + include_num_layers_dimension=True + ) + except (AttributeError, NotImplementedError): + return False + + # check that attention backend include a layers dimension + return len(kv_cache_stride_order) == len(kv_cache_shape) + 1 + + @staticmethod + def allocate_uniform_kv_caches( + kv_cache_config: KVCacheConfig, + attn_groups: list[list[AttentionGroup]], + cache_dtype: CacheDType, + device: torch.device, + kernel_block_sizes: list[int], + ) -> tuple[dict[str, torch.Tensor], torch.Tensor, type[AttentionBackend]]: + """ + Initializes and reshapes KV caches for the simple case where all + layers have the same layout. + + This function assumes use_uniform_kv_cache() returned True. + + Args: + kv_cache_config: The KV cache config + attn_groups: The list of attention groups for this model + cache_dtype: The KV cache dtype + device: The torch device to allocate on. + kernel_block_sizes: The kernel block sizes for each KV cache group. + Returns: + A tuple (kv_caches, cross_layers_kv_cache, attn_backend) where: + kv_caches is a dict mapping between layer names to their + corresponding memory buffer for KV cache. + cross_layers_kv_cache is the cross layers kv cache tensor + attn_backend is the attention backend matching this tensor + """ + attn_group = attn_groups[0][0] + kv_cache_spec = attn_group.kv_cache_spec + assert isinstance(kv_cache_spec, AttentionSpec) + + tensor_sizes = set( + kv_cache_tensor.size for kv_cache_tensor in kv_cache_config.kv_cache_tensors + ) + assert len(tensor_sizes) == 1 + tensor_size = tensor_sizes.pop() + + page_size = kv_cache_spec.page_size_bytes + assert tensor_size % page_size == 0 + num_blocks = tensor_size // page_size + num_layers = len(kv_cache_config.kv_cache_tensors) + total_size = tensor_size * num_layers + + assert len(kernel_block_sizes) == 1 + kernel_block_size = kernel_block_sizes[0] + num_blocks_per_kv_block = kv_cache_spec.block_size // kernel_block_size + kernel_num_blocks = num_blocks * num_blocks_per_kv_block + + attn_backend = attn_group.backend + kv_cache_shape = attn_backend.get_kv_cache_shape( + kernel_num_blocks, + kernel_block_size, + kv_cache_spec.num_kv_heads, + kv_cache_spec.head_size, + cache_dtype_str=cache_dtype, + ) + + # prepend a num_layers dimension into the shape + kv_cache_shape = (num_layers,) + kv_cache_shape + + try: + kv_cache_stride_order = attn_backend.get_kv_cache_stride_order( + include_num_layers_dimension=True + ) + assert len(kv_cache_stride_order) == len(kv_cache_shape) + except (AttributeError, NotImplementedError): + kv_cache_stride_order = tuple(range(len(kv_cache_shape))) + + kv_cache_shape = tuple(kv_cache_shape[i] for i in kv_cache_stride_order) + + logger.info("Allocating a cross layer KV cache of shape %s", kv_cache_shape) + + # allocate one contiguous buffer for all layers + cross_layers_kv_cache = ( + torch.zeros(total_size, dtype=torch.int8, device=device) + 
.view(kv_cache_spec.dtype) + .view(kv_cache_shape) + ) + + # Maintain original KV shape view. + inv_order = [ + kv_cache_stride_order.index(i) for i in range(len(kv_cache_stride_order)) + ] + permuted_kv_cache = cross_layers_kv_cache.permute(*inv_order) + + kv_caches = {} + for i, kv_cache_tensor in enumerate(kv_cache_config.kv_cache_tensors): + tensor = permuted_kv_cache[i] + for layer_name in kv_cache_tensor.shared_by: + kv_caches[layer_name] = tensor + + return kv_caches, cross_layers_kv_cache, attn_backend From 114b0e25004b7e7cf0a23dc65f407471bd5de7e8 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Thu, 20 Nov 2025 10:22:40 -0800 Subject: [PATCH 250/578] [chore] Update annotate release scripts (#29077) Signed-off-by: Kevin H. Luu --- .buildkite/scripts/annotate-release.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.buildkite/scripts/annotate-release.sh b/.buildkite/scripts/annotate-release.sh index 56bb5cedaa0a..df805e085080 100755 --- a/.buildkite/scripts/annotate-release.sh +++ b/.buildkite/scripts/annotate-release.sh @@ -23,8 +23,8 @@ To download the wheel (by version): aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl . aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl . -aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl . aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl . +aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu130/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl . \`\`\` To download and upload the image: @@ -45,9 +45,10 @@ docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 docker push vllm/vllm-openai:latest-aarch64 docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 -docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend -docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend +docker manifest rm vllm/vllm-openai:latest +docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 +docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 docker manifest push vllm/vllm-openai:latest docker manifest push vllm/vllm-openai:v${RELEASE_VERSION} \`\`\` -EOF \ No newline at end of file +EOF From 4d01b6428448225807e6605d04e37e29fe729b44 Mon Sep 17 00:00:00 2001 From: Software Developer <7852635+dsuhinin@users.noreply.github.com> Date: Thu, 20 Nov 2025 21:00:33 +0100 Subject: [PATCH 251/578] [Bugfix] - Add Trace Headers to Beam Search Path (#29100) Signed-off-by: dsuhinin --- vllm/entrypoints/openai/serving_chat.py | 1 + vllm/entrypoints/openai/serving_completion.py | 1 + vllm/entrypoints/openai/serving_engine.py | 2 ++ 3 files changed, 4 insertions(+) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 59e1c8d53179..6cc685acd672 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -319,6 +319,7 @@ async def create_chat_completion( request_id=request_id, params=sampling_params, lora_request=lora_request, + trace_headers=trace_headers, ) else: engine_request, 
tokenization_kwargs = await self._process_inputs( diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index a114b77ebc16..1cfb45ef4036 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -216,6 +216,7 @@ async def create_completion( request_id=request_id, params=sampling_params, lora_request=lora_request, + trace_headers=trace_headers, ) else: engine_request, tokenization_kwargs = await self._process_inputs( diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 127b8e6dcb87..7dab5dbacd28 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -343,6 +343,7 @@ async def beam_search( request_id: str, params: BeamSearchParams, lora_request: LoRARequest | None = None, + trace_headers: Mapping[str, str] | None = None, ) -> AsyncGenerator[RequestOutput, None]: beam_width = params.beam_width max_tokens = params.max_tokens @@ -437,6 +438,7 @@ async def beam_search( beam_search_params, request_id_item, lora_request=lora_req, + trace_headers=trace_headers, ) ) ) From 3d84ef9054af190ce68333be3e4d16fe928be754 Mon Sep 17 00:00:00 2001 From: rasmith Date: Thu, 20 Nov 2025 14:39:49 -0600 Subject: [PATCH 252/578] [CI/Build][AMD] Skip if flash_attn_varlen_func not available in test_aiter_flash_attn.py (#29043) Signed-off-by: Randall Smith Co-authored-by: Randall Smith --- tests/kernels/attention/test_aiter_flash_attn.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/kernels/attention/test_aiter_flash_attn.py b/tests/kernels/attention/test_aiter_flash_attn.py index 1dec46e33f22..8f58c470d217 100644 --- a/tests/kernels/attention/test_aiter_flash_attn.py +++ b/tests/kernels/attention/test_aiter_flash_attn.py @@ -6,6 +6,7 @@ import torch import vllm.v1.attention.backends.rocm_aiter_fa # noqa: F401 +from vllm.attention.utils.fa_utils import is_flash_attn_varlen_func_available from vllm.platforms import current_platform NUM_HEADS = [(4, 4), (8, 2)] @@ -100,6 +101,8 @@ def test_varlen_with_paged_kv( num_blocks: int, q_dtype: torch.dtype | None, ) -> None: + if not is_flash_attn_varlen_func_available(): + pytest.skip("flash_attn_varlen_func required to run this test.") torch.set_default_device("cuda") current_platform.seed_everything(0) num_seqs = len(seq_lens) From 5e5a7eb16f121f05e19c8bdf88247744ab9d1b83 Mon Sep 17 00:00:00 2001 From: rasmith Date: Thu, 20 Nov 2025 14:45:56 -0600 Subject: [PATCH 253/578] [CI/Build] Make test_attention_selector.py run tests on correct platform (#29064) Signed-off-by: Randall Smith Signed-off-by: rasmith Co-authored-by: Randall Smith Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- tests/kernels/attention/test_attention_selector.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py index 3b8e939300a2..9be56a33f76c 100644 --- a/tests/kernels/attention/test_attention_selector.py +++ b/tests/kernels/attention/test_attention_selector.py @@ -7,6 +7,7 @@ import torch from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend +from vllm.platforms import current_platform from vllm.platforms.cpu import CpuPlatform from vllm.platforms.cuda import CudaPlatform from vllm.platforms.rocm import RocmPlatform @@ -47,9 +48,11 @@ def clear_cache(): def generate_params(): + 
is_rocm = current_platform.is_rocm() params = [] + device_list = ["cuda", "cpu"] if not is_rocm else ["hip", "cpu"] for use_mla in [True, False]: - for device in ["cuda", "hip", "cpu"]: + for device in device_list: backends = ( DEVICE_MLA_BACKENDS[device] if use_mla From 3fd74189db13c9793325d9a36539d891873d1ae4 Mon Sep 17 00:00:00 2001 From: Driss Guessous <32754868+drisspg@users.noreply.github.com> Date: Thu, 20 Nov 2025 13:21:54 -0800 Subject: [PATCH 254/578] Fixes bench (#29058) Signed-off-by: drisspg --- vllm/compilation/caching.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/compilation/caching.py b/vllm/compilation/caching.py index 16e34c2711e9..63b7ad7279e3 100644 --- a/vllm/compilation/caching.py +++ b/vllm/compilation/caching.py @@ -12,6 +12,7 @@ import vllm.envs as envs from vllm.config import VllmConfig, get_current_vllm_config +from vllm.config.utils import hash_factors from vllm.logger import init_logger try: @@ -138,7 +139,7 @@ def compilation_config_hash_factors(vllm_config: VllmConfig) -> list[str]: factors = [] # 0. factors come from the env, for example, The values of # VLLM_PP_LAYER_PARTITION will affect the computation graph. - env_hash = envs.compute_hash() + env_hash = hash_factors(envs.compile_factors()) factors.append(env_hash) # 1. factors come from the vllm_config (it mainly summarizes how the From 8237ab8a2bed14bec5cafbec75033c8e1d54d852 Mon Sep 17 00:00:00 2001 From: rasmith Date: Thu, 20 Nov 2025 15:35:14 -0600 Subject: [PATCH 255/578] [CI/Build] Skip lm-format-enforcer tests in test_struct_output_generate.py for now (#29021) Signed-off-by: Randall Smith Co-authored-by: Randall Smith --- .../llm/test_struct_output_generate.py | 28 +++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index a7d769c8542a..316e152e7395 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -47,10 +47,34 @@ PARAMS_MODELS_BACKENDS_TOKENIZER_MODE = [ ("mistralai/Ministral-8B-Instruct-2410", "xgrammar", "auto", None), ("mistralai/Ministral-8B-Instruct-2410", "guidance", "auto", None), - ("mistralai/Ministral-8B-Instruct-2410", "lm-format-enforcer", "auto", None), + pytest.param( + "mistralai/Ministral-8B-Instruct-2410", + "lm-format-enforcer", + "auto", + None, + marks=pytest.mark.skip( + reason=( + "Flaky: lm-format-enforcer intermittently returns" + "incomplete JSON." + "See https://github.com/noamgat/lm-format-enforcer/issues/169" + ) + ), + ), ("mistralai/Ministral-8B-Instruct-2410", "xgrammar", "mistral", None), ("Qwen/Qwen2.5-1.5B-Instruct", "xgrammar", "auto", None), - ("Qwen/Qwen2.5-1.5B-Instruct", "lm-format-enforcer", "auto", None), + pytest.param( + "Qwen/Qwen2.5-1.5B-Instruct", + "lm-format-enforcer", + "auto", + None, + marks=pytest.mark.skip( + reason=( + "Flaky: lm-format-enforcer intermittently returns" + "incomplete JSON." + "See https://github.com/noamgat/lm-format-enforcer/issues/169" + ) + ), + ), # FIXME: This tests are flaky on CI thus disabled. 
Tracking in Issue #24402 # ("mistralai/Ministral-8B-Instruct-2410", "outlines", "auto", None), # ("mistralai/Ministral-8B-Instruct-2410", "outlines", "mistral", None), From c7a29d2c8d07ce6188d0c4bb19df6fd1d0e9bc74 Mon Sep 17 00:00:00 2001 From: rasmith Date: Thu, 20 Nov 2025 15:44:37 -0600 Subject: [PATCH 256/578] [CI/Build] Remove skip global cleanup in test_struct_output_generate.py (#29022) Signed-off-by: Randall Smith Co-authored-by: Randall Smith --- tests/v1/entrypoints/llm/test_struct_output_generate.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index 316e152e7395..a00600b87eca 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -121,7 +121,6 @@ def test_guided_decoding_deprecated(): assert sp1.structured_outputs == guided_decoding -@pytest.mark.skip_global_cleanup @pytest.mark.parametrize( "model_name, backend, tokenizer_mode, speculative_config", PARAMS_MODELS_BACKENDS_TOKENIZER_MODE, @@ -626,7 +625,6 @@ def test_structured_output( ) -@pytest.mark.skip_global_cleanup @pytest.mark.parametrize( "model_name, backend, tokenizer_mode, reasoning_parser, speculative_config", # noqa: E501 [ @@ -711,7 +709,6 @@ def test_structured_output_with_reasoning_matrices( jsonschema.validate(instance=output_json, schema=reasoning_schema) -@pytest.mark.skip_global_cleanup @pytest.mark.parametrize("model_name, tokenizer_mode", PARAMS_MODELS_TOKENIZER_MODE) def test_structured_output_auto_mode( unsupported_json_schema: dict[str, Any], @@ -758,7 +755,6 @@ def test_structured_output_auto_mode( assert isinstance(parsed_json, dict) -@pytest.mark.skip_global_cleanup def test_guidance_no_additional_properties(): llm = LLM( model="Qwen/Qwen2.5-1.5B-Instruct", From dd39f91edb0588e2dd77eb55c758eb1e35907af8 Mon Sep 17 00:00:00 2001 From: Rob Mulla Date: Thu, 20 Nov 2025 19:05:59 -0500 Subject: [PATCH 257/578] [Doc] cleanup TPU documentation and remove outdated examples (#29048) Signed-off-by: Rob Mulla Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/.nav.yml | 6 +- docs/configuration/tpu.md | 111 ------------------ docs/features/README.md | 37 +++--- docs/features/quantization/README.md | 29 +++-- docs/models/hardware_supported_models/tpu.md | 34 ------ .../offline_inference/profiling_tpu/README.md | 70 ----------- .../profiling_tpu/profiling.py | 110 ----------------- examples/offline_inference/tpu.py | 58 --------- 8 files changed, 40 insertions(+), 415 deletions(-) delete mode 100644 docs/configuration/tpu.md delete mode 100644 docs/models/hardware_supported_models/tpu.md delete mode 100644 examples/offline_inference/profiling_tpu/README.md delete mode 100644 examples/offline_inference/profiling_tpu/profiling.py delete mode 100644 examples/offline_inference/tpu.py diff --git a/docs/.nav.yml b/docs/.nav.yml index 3151ea0e2ec2..c8bf00efb237 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -24,14 +24,16 @@ nav: - deployment/integrations - Training: training - Configuration: - - configuration/README.md - configuration/* + - TPU: https://docs.vllm.ai/projects/tpu/en/latest/ - Models: - models/supported_models.md - models/generative_models.md - models/pooling_models.md - models/extensions - - Hardware Supported Models: models/hardware_supported_models + - Hardware Supported Models: + - 
models/hardware_supported_models/* + - TPU: https://docs.vllm.ai/projects/tpu/en/latest/recommended_models_features/ - Features: features - Developer Guide: - contributing/README.md diff --git a/docs/configuration/tpu.md b/docs/configuration/tpu.md deleted file mode 100644 index 2d24c9c6e2e9..000000000000 --- a/docs/configuration/tpu.md +++ /dev/null @@ -1,111 +0,0 @@ -# TPU Optimization Tips - -This doc serves as a collection of handy tips for optimizing your vLLM on TPU workload. - -## Get started - -Looking for setup and installation instructions? Find them [here](https://docs.vllm.ai/projects/tpu/en/latest/getting_started/installation/). - -### TPU workload sizing - -When selecting the ideal number of chips for a single serving instance, it's important to account for both the model size and the average request context length. Adequate HBM for the KV cache is essential to ensure a sufficient number of concurrent requests can be processed. - -The following colab [calculator](https://colab.research.google.com/github/ericehanley/rightsize-vllm/blob/main/HBM_Calculator.ipynb) will tell you: - -- KV cache size requirement per token and per request -- TPU/GPU memory consumed by the model weights -- TPU/GPU memory allocated for the KV cache -- Maximum \# of requests you can approximately set (--max-num-seqs) - -This approach serves as a general rule of thumb. - -#### Latency-throughput tradeoff - -As with rightsizing the number of chips for your workload, consider adjusting `--max-num-seqs` to fine-tune the latency-throughput balance. Decreasing `--max-num-seqs` and/or increasing the number of chips can help reduce latency. - -`--max-num-seqs` defines the number of concurrent decode slots, effectively limiting the number of requests the server can process tokens for simultaneously. Increasing this value allows the server to pre-allocate more HBM to handle a higher number of concurrent requests, which can maximize overall throughput. However, this often increases the end-to-end (e2e) latency per request. - -Therefore, carefully tuning `--max-num-seqs` is crucial to achieving the desired balance between latency and throughput for your specific workload. - -In a similar way, `--max-num-batch-tokens` can be adjusted down to improve latency, or adjusted up to improve throughput. - -#### Compilation and Caching - -Coming from a GPU background, one of the key differences you'll notice with TPUs is an initial compilation step. TPUs are specialized accelerators (ASICs) that achieve maximum performance by executing pre-compiled, static computation graphs via the XLA compiler. Unlike GPUs, which can handle dynamic input shapes more flexibly, TPUs require a specific compiled graph for each tensor shape (e.g., batch size and sequence length) they process. - -To manage this, vLLM performs a one-time "warmup" process when you first launch the server. During this phase, it pre-compiles the model for various common input shapes and saves these compiled graphs to a cache on disk or remote storage (located at `~/.cache/vllm/xla_cache` by default). This process can range significantly, anywhere from a few minutes to an hour depending on the size of the model and context length used. - -Although the first compilation can take some time, for all subsequent server launches, vLLM can load these graphs directly from the cache, eliminating the compilation time for future runs. - -Use `VLLM_XLA_CACHE_PATH` environment variable to write to shareable storage for future deployed nodes (like when using autoscaling). 
- -#### Reducing compilation time - -This initial compilation time ranges significantly and is impacted by many of the arguments discussed in this optimization doc. Factors that influence the length of time to compile are things like model size and `--max-num-batch-tokens`. Other arguments you can tune are things like `VLLM_TPU_MOST_MODEL_LEN`. - -### Optimize based on your data - -#### max-model-len vs. most-model-len - -![most_model_len](../assets/design/tpu/most_model_len.png) - -If most of your requests are shorter than the maximum model length but you still need to accommodate occasional longer requests, setting a high maximum model length can negatively impact performance. In these cases, you can try introducing most-model-len by specifying the `VLLM_TPU_MOST_MODEL_LEN` environment variable. - -For example, 1% requests are 32k length and 99% requests are 2k length. You can pass 32k into `--max-model-len 32768` and use `VLLM_TPU_MOST_MODEL_LEN=2048`. - -The requests get subdivided into max-model-len and most-model-len categories, for the latter category, you can gain better performance since the server can process more requests at a time. - -#### Padding - -For online serving with latency requirements, consider switching to bucket padding by setting the `VLLM_TPU_BUCKET_PADDING_GAP` environment variable. Because of the layout of the TPU, try using increments of 128 (e.g., 128, 256, etc.) - -The server pads the requests into fixed lengths before sending them to the model to avoid recompilation. To read more about TPU padding, see [here](https://cloud.google.com/tpu/docs/performance-guide#xla-efficiencies). Currently, there are 2 ways to pad the requests: - -1. the default exponential padding (pad to the nearest power of 2) -2. bucket padding (pad to the nearest linearly increasing bucket). - -When using bucket padding, the buckets start from 16, end at max_model_len, and increment by `VLLM_TPU_BUCKET_PADDING_GAP`. - -For example, max_model_len=512, padding_gap=64, the buckets will be [16, 32, 64, 128, 192, 256, 320, 384, 448, 512]. - -The fewer tokens you pad, the less unnecessary computation TPU does, the better performance you can get. For example, if num_tokens=300, with exponential padding, you pad to 512, with the bucket_padding above, you pad to 320. - -However, you need to be careful to choose the padding gap. If the gap is too small, it means the number of buckets is large, leading to increased warmup (precompile) time and higher memory to store the compiled graph. Too many compiled graphs may lead to HBM OOM. Conversely, an overly large gap yields no performance improvement compared to the default exponential padding. - -#### Quantization - -If possible, use the precision that matches the chip’s hardware acceleration: - -- v5e has int4/int8 hardware acceleration in the MXU -- v6e has int4/int8 hardware acceleration in the MXU - -Supported quantized formats and features in vLLM on TPU [Jul '25]: - -- INT8 W8A8 -- INT8 W8A16 -- FP8 KV cache -- [WIP] FP8 W8A8 -- [WIP] AWQ -- [WIP] FP4 W4A8 - -#### Parallelization - -Don't set TP to be less than the number of chips on a single-host deployment. - -Although it’s common to do this with GPUs, don't try to fragment 2 or 8 different workloads across 8 chips on a single host. If you need 1 or 4 chips, just create an instance with 1 or 4 chips (these are partial-host machine types). 
- -### Tune your workloads - -Although we try to have great default configs, we strongly recommend you check out the [vLLM auto-tuner](../../benchmarks/auto_tune/README.md) to optimize your workloads for your use case. - -### Future Topics We'll Cover - -#### Profiling - -The auto-tuner provides a profile of optimized configurations as its final step. However, interpreting this profile can be challenging for new users. We plan to expand this section in the future with more detailed guidance. In the meantime, you can learn how to collect a TPU profile using vLLM's native profiling tools [here](../examples/offline_inference/profiling_tpu.md). This profile can provide valuable insights into your workload's performance. - -#### SPMD - -More details to come. - -**Want us to cover something that isn't listed here? Open up an issue please and cite this doc. We'd love to hear your questions or tips.** diff --git a/docs/features/README.md b/docs/features/README.md index ad9de9ff8f36..5faf3768f321 100644 --- a/docs/features/README.md +++ b/docs/features/README.md @@ -59,20 +59,23 @@ th:not(:first-child) { ### Feature x Hardware -| Feature | Volta | Turing | Ampere | Ada | Hopper | CPU | AMD | TPU | Intel GPU | -|-----------------------------------------------------------|---------------------|-----------|-----------|--------|------------|--------------------|--------|-----| ------------| -| [CP](../configuration/optimization.md#chunked-prefill) | [❌](https://github.com/vllm-project/vllm/issues/2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [APC](automatic_prefix_caching.md) | [❌](https://github.com/vllm-project/vllm/issues/3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [SD](spec_decode.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | [🟠](https://github.com/vllm-project/vllm/issues/26963) | -| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | [❌](https://github.com/vllm-project/vllm/issues/26970) | -| [pooling](../models/pooling_models.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | -| enc-dec | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | -| [mm](multimodal_inputs.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | [🟠](https://github.com/vllm-project/vllm/issues/26965) | -| logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | -| prmpt logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | -| async output | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | -| multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/8477) | ✅ | ❌ | ✅ | -| best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | -| beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | -| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | [❌](https://github.com/vllm-project/vllm/issues/25097) | ✅ | +| Feature | Volta | Turing | Ampere | Ada | Hopper | CPU | AMD | Intel GPU | +|-----------------------------------------------------------|---------------------|-----------|-----------|--------|------------|--------------------|--------| ------------| +| [CP](../configuration/optimization.md#chunked-prefill) | [❌](https://github.com/vllm-project/vllm/issues/2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [APC](automatic_prefix_caching.md) | [❌](https://github.com/vllm-project/vllm/issues/3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [SD](spec_decode.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | [🟠](https://github.com/vllm-project/vllm/issues/26963) | +| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/26970) | +| [pooling](../models/pooling_models.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| 
enc-dec | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | +| [mm](multimodal_inputs.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | [🟠](https://github.com/vllm-project/vllm/issues/26965) | +| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | +| logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| prmpt logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| async output | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | +| multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/8477) | ✅ | ✅ | +| best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | + +!!! note + For information on feature support on Google TPU, please refer to the [TPU-Inference Recommended Models and Features](https://docs.vllm.ai/projects/tpu/en/latest/recommended_models_features/) documentation. diff --git a/docs/features/quantization/README.md b/docs/features/quantization/README.md index 74f005c496ee..7b5287bad3bb 100644 --- a/docs/features/quantization/README.md +++ b/docs/features/quantization/README.md @@ -43,24 +43,27 @@ th:not(:first-child) { } -| Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | Intel Gaudi | x86 CPU | Google TPU | -|-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-------------|-----------|--------------| -| AWQ | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | -| GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | -| Marlin (GPTQ/AWQ/FP8) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | -| INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | -| FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | -| BitBLAS | ✅︎ | ✅ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | -| BitBLAS (GPTQ) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | -| bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | -| DeepSpeedFP | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | -| GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | -| INC (W8A8) | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅︎ | ❌ | ❌ | +| Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | Intel Gaudi | x86 CPU | +|-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-------------|-----------| +| AWQ | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | +| GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | +| Marlin (GPTQ/AWQ/FP8) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | +| INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | +| FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | +| BitBLAS | ✅︎ | ✅ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | +| BitBLAS (GPTQ) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | +| bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | +| DeepSpeedFP | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | +| GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | +| INC (W8A8) | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅︎ | ❌ | - Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. - ✅︎ indicates that the quantization method is supported on the specified hardware. - ❌ indicates that the quantization method is not supported on the specified hardware. +!!! note + For information on quantization support on Google TPU, please refer to the [TPU-Inference Recommended Models and Features](https://docs.vllm.ai/projects/tpu/en/latest/recommended_models_features/) documentation. + !!! note This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. 
diff --git a/docs/models/hardware_supported_models/tpu.md b/docs/models/hardware_supported_models/tpu.md deleted file mode 100644 index 7b0a5ba6e72d..000000000000 --- a/docs/models/hardware_supported_models/tpu.md +++ /dev/null @@ -1,34 +0,0 @@ -# TPU - -## Supported Models - -### Text-only Language Models - -| Model | Architecture | Supported | -|-----------------------------------------------------|--------------------------------|-----------| -| mistralai/Mixtral-8x7B-Instruct-v0.1 | MixtralForCausalLM | 🟨 | -| mistralai/Mistral-Small-24B-Instruct-2501 | MistralForCausalLM | ✅ | -| mistralai/Codestral-22B-v0.1 | MistralForCausalLM | ✅ | -| mistralai/Mixtral-8x22B-Instruct-v0.1 | MixtralForCausalLM | ❌ | -| meta-llama/Llama-3.3-70B-Instruct | LlamaForCausalLM | ✅ | -| meta-llama/Llama-3.1-8B-Instruct | LlamaForCausalLM | ✅ | -| meta-llama/Llama-3.1-70B-Instruct | LlamaForCausalLM | ✅ | -| meta-llama/Llama-4-* | Llama4ForConditionalGeneration | ❌ | -| microsoft/Phi-3-mini-128k-instruct | Phi3ForCausalLM | 🟨 | -| microsoft/phi-4 | Phi3ForCausalLM | ❌ | -| google/gemma-3-27b-it | Gemma3ForConditionalGeneration | 🟨 | -| google/gemma-3-4b-it | Gemma3ForConditionalGeneration | ❌ | -| deepseek-ai/DeepSeek-R1 | DeepseekV3ForCausalLM | ❌ | -| deepseek-ai/DeepSeek-V3 | DeepseekV3ForCausalLM | ❌ | -| RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 | LlamaForCausalLM | ✅ | -| RedHatAI/Meta-Llama-3.1-70B-Instruct-quantized.w8a8 | LlamaForCausalLM | ✅ | -| Qwen/Qwen3-8B | Qwen3ForCausalLM | ✅ | -| Qwen/Qwen3-32B | Qwen3ForCausalLM | ✅ | -| Qwen/Qwen2.5-7B-Instruct | Qwen2ForCausalLM | ✅ | -| Qwen/Qwen2.5-32B | Qwen2ForCausalLM | ✅ | -| Qwen/Qwen2.5-14B-Instruct | Qwen2ForCausalLM | ✅ | -| Qwen/Qwen2.5-1.5B-Instruct | Qwen2ForCausalLM | 🟨 | - -✅ Runs and optimized. -🟨 Runs and correct but not optimized to green yet. -❌ Does not pass accuracy test or does not run. diff --git a/examples/offline_inference/profiling_tpu/README.md b/examples/offline_inference/profiling_tpu/README.md deleted file mode 100644 index 8c9c1c92b676..000000000000 --- a/examples/offline_inference/profiling_tpu/README.md +++ /dev/null @@ -1,70 +0,0 @@ -# vLLM TPU Profiling - -This script is used to profile the TPU performance of vLLM for specific prefill or decode token shapes. - -Note: an actual running server is a mix of both prefill of many shapes and decode of many shapes. - -We assume you are on a TPU already (this was tested on TPU v6e) and have installed vLLM according to the [Google TPU installation guide](https://docs.vllm.ai/en/latest/getting_started/installation/google_tpu.html). - -> In all examples below, we run several warmups before (so `--enforce-eager` is okay) - -## Profile Examples - -### Generate Prefill Trace - -This example runs Qwen/Qwen2.5-7B-Instruct with a single request of 1024 input tokens. This is set up in attempt to profile just the prefill time and operations. - -```bash -export XLA_HLO_DEBUG=1 -export MODEL=Qwen/Qwen2.5-7B-Instruct -export VLLM_TPU_PROFILE_DURATION_MS=3000 -export VLLM_TPU_PROFILE_DELAY_MS=0 - -python3 profiling.py \ - --model $MODEL \ - --input-len 1024 --output-len 1 \ - --batch-size 1 --enforce-eager \ - --max-model-len 2048 \ - --tensor-parallel-size 1 \ - --profile-result-dir profiles -``` - -### Generate Decode Trace - -This example runs Llama 3.1 70B with a batch of 32 requests where each has 1 input token and 128 output tokens. 
This is set up in attempt to profile just the 32 decodes running in parallel by having an extremely small prefill of 1 token and setting `VLLM_TPU_PROFILE_DELAY_MS=1000` to skip the first second of inference (hopefully prefill). - -```bash -export XLA_HLO_DEBUG=1 -export MODEL=meta-llama/Llama-3.1-70B-Instruct -export VLLM_TPU_PROFILE_DURATION_MS=2000 -export VLLM_TPU_PROFILE_DELAY_MS=1000 - -rm -rf ~/.cache/vllm/xla_cache -python3 profiling.py \ - --model $MODEL \ - --input-len 1 \ - --output-len 128 \ - --batch-size 32 \ - --enforce-eager \ - --profile-result-dir profiles \ - --max-model-len 2048 --tensor-parallel-size 8 -``` - -## Visualizing the profiles - -Once you have collected your profiles with this script, you can visualize them using [TensorBoard](https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm). - -Here are most likely the dependencies you need to install: - -```bash -pip install tensorflow-cpu \ - tensorboard-plugin-profile \ - etils \ - importlib_resources -``` - -Then you just need to point TensorBoard to the directory where you saved the profiles and visit `http://localhost:6006/` in your browser: - -```bash -tensorboard --logdir profiles/ --port 6006 -``` diff --git a/examples/offline_inference/profiling_tpu/profiling.py b/examples/offline_inference/profiling_tpu/profiling.py deleted file mode 100644 index 3b127e4fd29d..000000000000 --- a/examples/offline_inference/profiling_tpu/profiling.py +++ /dev/null @@ -1,110 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import argparse -import dataclasses -import os -import time - -import numpy as np -import torch_xla.debug.profiler as xp -from tqdm import tqdm - -from vllm import LLM, SamplingParams -from vllm.engine.arg_utils import EngineArgs -from vllm.inputs import PromptType -from vllm.utils.argparse_utils import FlexibleArgumentParser - -DURATION_MS = int(os.getenv("VLLM_TPU_PROFILE_DURATION_MS", 3000)) -DELAY_MS = int(os.getenv("VLLM_TPU_PROFILE_DELAY_MS", 0)) - - -def main(args: argparse.Namespace): - print(args) - - engine_args = EngineArgs.from_cli_args(args) - llm = LLM(**dataclasses.asdict(engine_args)) - server = xp.start_server(9012) # noqa: F841 - - sampling_params = SamplingParams( - temperature=0.0, - ignore_eos=True, - max_tokens=args.output_len, - ) - print(sampling_params) - dummy_prompt_token_ids = np.random.randint( - 10000, size=(args.batch_size, args.input_len) - ) - dummy_prompts: list[PromptType] = [ - {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist() - ] - - def run_to_completion(): - start_time = time.perf_counter() - llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False) - end_time = time.perf_counter() - latency = end_time - start_time - return latency - - # Warmup - print("Warming up...") - warmup_latencies = [] - for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"): - warmup_latencies.append(run_to_completion()) - print(f"Average warmup latency: {np.mean(warmup_latencies):.4f}s") - - # Profile - profile_dir = args.profile_result_dir - print(f"Profiling (results will be saved to '{profile_dir}')...") - # Enable tracing on server - xp.trace_detached( - "localhost:9012", profile_dir, delay_ms=DELAY_MS, duration_ms=DURATION_MS - ) - if DELAY_MS == 0: - time.sleep(1.0) - profile_latencies = [] - for _ in tqdm(range(args.num_iters), desc="Profile iterations"): - profile_latencies.append(run_to_completion()) - print(f"Average profile latency: 
{np.mean(profile_latencies):.4f}s") - - return - - -def parse_args(): - parser = FlexibleArgumentParser( - description="Benchmark the latency of processing a single batch of " - "requests till completion." - ) - parser.add_argument("--input-len", type=int, default=32) - parser.add_argument("--output-len", type=int, default=128) - parser.add_argument("--batch-size", type=int, default=8) - parser.add_argument( - "--num-iters-warmup", - type=int, - default=5, - help="Number of iterations to run for warmup.", - ) - parser.add_argument( - "--num-iters", - type=int, - default=1, - help="Number of iterations to run for profiling.", - ) - parser.add_argument( - "--profile-result-dir", - type=str, - default="profiles", - help=( - "path to save the pytorch profiler output. Can be visualized " - "with ui.perfetto.dev or Tensorboard " - "(https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm)." - ), - ) - - parser = EngineArgs.add_cli_args(parser) - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_args() - main(args) diff --git a/examples/offline_inference/tpu.py b/examples/offline_inference/tpu.py deleted file mode 100644 index 0093b63b0b1f..000000000000 --- a/examples/offline_inference/tpu.py +++ /dev/null @@ -1,58 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import argparse -import os - -from vllm import LLM, SamplingParams - -prompts = [ - "A robot may not injure a human being", - "It is only with the heart that one can see rightly;", - "The greatest glory in living lies not in never falling,", -] -answers = [ - " or, through inaction, allow a human being to come to harm.", - " what is essential is invisible to the eye.", - " but in rising every time we fall.", -] -N = 1 -# Currently, top-p sampling is disabled. `top_p` should be 1.0. -sampling_params = SamplingParams(temperature=0, top_p=1.0, n=N, max_tokens=16) - - -def main(): - parser = argparse.ArgumentParser(description="TPU offline inference example") - parser.add_argument("--use-spmd", action="store_true", help="Enable SPMD mode") - args = parser.parse_args() - - llm_args = { - "model": "Qwen/Qwen2-1.5B-Instruct", - "max_num_batched_tokens": 64, - "max_num_seqs": 4, - "max_model_len": 128, - } - if args.use_spmd: - os.environ["VLLM_XLA_USE_SPMD"] = "1" - # Can only hardcode the number of chips for now. - # calling xr.global_runtime_device_count() beforeing init SPMD env in - # torch_xla will mess up the distributed env. - llm_args["tensor_parallel_size"] = 8 - # Use Llama, for num_kv_heads = 8. - llm_args["model"] = "meta-llama/Llama-3.1-8B-Instruct" - - # Set `enforce_eager=True` to avoid ahead-of-time compilation. - # In real workloads, `enforce_eager` should be `False`. 
- llm = LLM(**llm_args) - outputs = llm.generate(prompts, sampling_params) - print("-" * 50) - for output, answer in zip(outputs, answers): - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") - assert generated_text.startswith(answer) - print("-" * 50) - - -if __name__ == "__main__": - main() From 986ab5db6325fb4a5d937084ca7921a95641504a Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 20 Nov 2025 19:42:33 -0500 Subject: [PATCH 258/578] [CI Bugfix] Fix Kernels DeepGEMM Test (H100) (#29106) Signed-off-by: mgoin --- .buildkite/test-pipeline.yaml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 71249a9543c7..6169b279dc8a 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -554,7 +554,6 @@ steps: timeout_in_minutes: 45 gpu: h100 num_gpus: 1 - optional: true source_file_dependencies: - tools/install_deepgemm.sh - vllm/utils/deep_gemm.py @@ -565,10 +564,10 @@ steps: - tests/kernels/moe/test_batched_deepgemm.py - tests/kernels/attention/test_deepgemm_attention.py commands: - - pytest -v -s tests/kernels/quantization/test_block_fp8.py -k deep_gemm - - pytest -v -s tests/kernels/moe/test_deepgemm.py - - pytest -v -s tests/kernels/moe/test_batched_deepgemm.py - - pytest -v -s tests/kernels/attention/test_deepgemm_attention.py + - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm + - pytest -v -s kernels/moe/test_deepgemm.py + - pytest -v -s kernels/moe/test_batched_deepgemm.py + - pytest -v -s kernels/attention/test_deepgemm_attention.py - label: Model Executor Test # 23min timeout_in_minutes: 35 From 87cbbdff639f96766d4f6604cc970394c550dc5b Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 20 Nov 2025 20:16:52 -0500 Subject: [PATCH 259/578] Update model references for OLMo3 (#29099) Signed-off-by: mgoin Signed-off-by: Michael Goin Co-authored-by: Cyrus Leung --- docs/models/supported_models.md | 2 +- tests/models/registry.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index f0531ced0aaa..626904a97415 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -424,7 +424,7 @@ th { | `NemotronHForCausalLM` | Nemotron-H | `nvidia/Nemotron-H-8B-Base-8K`, `nvidia/Nemotron-H-47B-Base-8K`, `nvidia/Nemotron-H-56B-Base-8K`, etc. | ✅︎ | ✅︎ | | `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | ✅︎ | ✅︎ | | `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | ✅︎ | ✅︎ | -| `OLMo3ForCausalLM` | OLMo3 | TBA | ✅︎ | ✅︎ | +| `OLMo3ForCausalLM` | OLMo3 | `allenai/Olmo-3-7B-Instruct`, `allenai/Olmo-3-32B-Think`, etc. | ✅︎ | ✅︎ | | `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ | | `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | ✅︎ | ✅︎ | | `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. 
| | ✅︎ | diff --git a/tests/models/registry.py b/tests/models/registry.py index 1999e3cd2de2..b088e16756d7 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -370,7 +370,7 @@ def check_available_online( ), "OlmoForCausalLM": _HfExamplesInfo("allenai/OLMo-1B-hf"), "Olmo2ForCausalLM": _HfExamplesInfo("allenai/OLMo-2-0425-1B"), - "Olmo3ForCausalLM": _HfExamplesInfo("shanearora/2025-sep-a-base-model"), + "Olmo3ForCausalLM": _HfExamplesInfo("allenai/Olmo-3-7B-Instruct"), "OlmoeForCausalLM": _HfExamplesInfo("allenai/OLMoE-1B-7B-0924-Instruct"), "OpenPanguMTPModel": _HfExamplesInfo( "FreedomIntelligence/openPangu-Ultra-MoE-718B-V1.1", From df44df01431e8af444222addddd2789c0483d70a Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 20 Nov 2025 20:41:49 -0500 Subject: [PATCH 260/578] [Feature] Shared Experts Overlap with FI deepgemm swap kernel, 2.2% throughput improvement and 3.6% TTFT improvement (#28879) Signed-off-by: yewentao256 --- .../fused_moe/fused_moe_modular_method.py | 1 + vllm/model_executor/layers/fused_moe/layer.py | 69 +++++++++------- .../layers/fused_moe/modular_kernel.py | 79 +++++++++++++++++-- .../layers/fused_moe/prepare_finalize.py | 3 +- 4 files changed, 119 insertions(+), 33 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py index 43974ba917e4..c6dc95acdb63 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py @@ -50,6 +50,7 @@ def make( prepare_finalize, old_quant_method.select_gemm_impl(prepare_finalize, moe_layer), shared_experts, + getattr(moe_layer, "shared_experts_stream", None), ), ) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index d9525a7439c3..b2f554efd8a6 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -850,6 +850,45 @@ def update_expert_map(self): dp_size=get_dp_group().world_size, ) + def _maybe_setup_shared_experts_stream( + self, + hidden_states: torch.Tensor, + has_separate_shared_experts: bool, + use_chunked_impl: bool, + ) -> tuple[bool, torch.Tensor | None]: + use_shared_experts_stream = ( + has_separate_shared_experts + and not use_chunked_impl + and self.shared_experts_stream is not None + and ( + hidden_states.shape[0] + <= envs.VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD + ) + ) + + hidden_states_clone: torch.Tensor | None = None + if use_shared_experts_stream: + assert self.shared_experts_stream is not None + + # Clone BEFORE switching streams to avoid race condition + # where routed_expert kernel may mutate hidden_states. + hidden_states_clone = hidden_states.clone() + + # Record that the clone will be used by shared_experts_stream + # to avoid gc issue from deallocation of hidden_states_clone + # For more details: https://docs.pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html # noqa: E501 + # NOTE: We dont need shared_output.record_stream(current_stream()) + # because we synch the streams before using shared_output. 
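+            # record_stream marks the clone as in use by shared_experts_stream,
+            # so the caching allocator will not reuse its memory until the work
+            # queued on that stream has finished.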
+ hidden_states_clone.record_stream(self.shared_experts_stream) + + # Mark sync start point for the separate shared experts + # stream here since we want to run in parallel with the + # router/gate (next op below) + assert self.shared_experts_stream is not None + self.shared_experts_stream.wait_stream(current_stream()) + + return use_shared_experts_stream, hidden_states_clone + def _load_per_tensor_weight_scale( self, shard_id: str, @@ -1819,36 +1858,12 @@ def forward_impl( use_chunked_impl = self.use_dp_chunking - use_shared_experts_stream = ( - has_separate_shared_experts - and not use_chunked_impl - and self.shared_experts_stream is not None - and ( - hidden_states.shape[0] - <= envs.VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD + use_shared_experts_stream, hidden_states_clone = ( + self._maybe_setup_shared_experts_stream( + hidden_states, has_separate_shared_experts, use_chunked_impl ) ) - if use_shared_experts_stream: - assert self.shared_experts_stream is not None - - # Clone BEFORE switching streams to avoid race condition - # where routed_expert kernel may mutate hidden_states. - hidden_states_clone = hidden_states.clone() - - # Record that the clone will be used by shared_experts_stream - # to avoid gc issue from deallocation of hidden_states_clone - # For more details: https://docs.pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html # noqa: E501 - # NOTE: We dont need shared_output.record_stream(current_stream()) - # because we synch the streams before using shared_output. - hidden_states_clone.record_stream(self.shared_experts_stream) - - # Mark sync start point for the separate shared experts - # stream here since we want to run in parallel with the - # router/gate (next op below) - assert self.shared_experts_stream is not None - self.shared_experts_stream.wait_stream(current_stream()) - # If router/gate provided, then apply it here. # (Note: This code runs only when "overlapped mode" is on to allow # parallel execution of shared experts with the FusedMoE via diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index 093affe51f50..4af7af9257df 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -16,6 +16,7 @@ count_expert_num_tokens, disable_inplace, ) +from vllm.platforms import current_platform from vllm.utils.math_utils import cdiv from vllm.v1.worker.ubatching import ( dbo_current_ubatch_id, @@ -709,11 +710,13 @@ def __init__( prepare_finalize: FusedMoEPrepareAndFinalize, fused_experts: FusedMoEPermuteExpertsUnpermute, shared_experts: torch.nn.Module | None = None, + shared_experts_stream: torch.cuda.Stream | None = None, ): super().__init__() self.prepare_finalize = prepare_finalize self.fused_experts = fused_experts self.shared_experts = shared_experts + self.shared_experts_stream = shared_experts_stream self._post_init_setup() assert ( @@ -890,6 +893,34 @@ def _slice_expert_tokens_metadata( expert_num_tokens_cpu=c_expert_num_tokens_cpu, ) + def _maybe_setup_shared_experts_stream( + self, hidden_states: torch.Tensor + ) -> tuple[bool, torch.Tensor | None]: + # decide whether to run shared experts on a separate CUDA stream to + # overlap with the main fused MoE kernel. 
+ use_shared_experts_stream = ( + self.shared_experts is not None + and self.shared_experts_stream is not None + and hidden_states.is_cuda + and ( + hidden_states.shape[0] + <= envs.VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD + ) + ) + + hidden_states_clone: torch.Tensor | None = None + if use_shared_experts_stream and self.shared_experts_stream is not None: + # TODO: Optimize this (complicated) + # Note: this clone adds overhead but is required + # for correctness with multiple CUDA streams and CUDA graph capture. + hidden_states_clone = hidden_states.clone() + # record that the clone will be used by the separate stream so its + # lifetime is correctly tracked. + hidden_states_clone.record_stream(self.shared_experts_stream) + self.shared_experts_stream.wait_stream(torch.cuda.current_stream()) + + return use_shared_experts_stream, hidden_states_clone + def _prepare( self, hidden_states: torch.Tensor, @@ -1077,12 +1108,30 @@ def _finalize( topk_weights: torch.Tensor, topk_ids: torch.Tensor, apply_router_weight_on_input: bool, + hidden_states_clone: torch.Tensor | None = None, + use_shared_experts_stream: bool = False, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: """ The _finalize method is a wrapper around self.prepare_finalize.finalize that handles DBO, async and shared expert overlap. """ - shared_output: torch.Tensor | None = None + + def maybe_run_shared_experts() -> torch.Tensor | None: + if self.shared_experts is None: + return None + + if ( + not use_shared_experts_stream + or self.shared_experts_stream is not None + and (not hidden_states.is_cuda or not torch.cuda.is_available()) + ): + # fall back to running on the current stream + return self.shared_experts(hidden_states) + + assert hidden_states_clone is not None + # launch shared experts on the dedicated stream. 
+ with torch.cuda.stream(self.shared_experts_stream): + return self.shared_experts(hidden_states_clone) if not self.prepare_finalize.supports_async(): assert not dbo_enabled() @@ -1095,8 +1144,7 @@ def _finalize( apply_router_weight_on_input, self.fused_experts.finalize_weight_and_reduce_impl(), ) - if self.shared_experts is not None: - shared_output = self.shared_experts(hidden_states) + shared_output = maybe_run_shared_experts() else: finalize_ret = self.prepare_finalize.finalize_async( output, @@ -1107,8 +1155,7 @@ def _finalize( self.fused_experts.finalize_weight_and_reduce_impl(), ) - if self.shared_experts is not None: - shared_output = self.shared_experts(hidden_states) + shared_output = maybe_run_shared_experts() # TODO(lucas): refactor this in the alternative schedules followup # currently unpack if we have hook + receiver pair or just @@ -1131,12 +1178,28 @@ def _finalize( receiver() + self._wait_for_shared_experts_stream(hidden_states, use_shared_experts_stream) + if self.shared_experts is None: return output else: assert shared_output is not None return shared_output, output + def _wait_for_shared_experts_stream( + self, hidden_states: torch.Tensor, use_shared_experts_stream: bool + ) -> None: + # ensure that any work enqueued on the shared_experts_stream is + # completed before the shared_output tensor is consumed + if ( + self.shared_experts is not None + and use_shared_experts_stream + and self.shared_experts_stream is not None + and hidden_states.is_cuda + and current_platform.is_cuda() + ): + torch.cuda.current_stream().wait_stream(self.shared_experts_stream) + def forward( self, hidden_states: torch.Tensor, @@ -1183,6 +1246,10 @@ def forward( else: output = torch.zeros_like(hidden_states) + use_shared_experts_stream, hidden_states_clone = ( + self._maybe_setup_shared_experts_stream(hidden_states) + ) + local_num_experts = w1.size(0) if global_num_experts == -1: global_num_experts = local_num_experts @@ -1219,4 +1286,6 @@ def forward( topk_weights, topk_ids, apply_router_weight_on_input, + hidden_states_clone=hidden_states_clone, + use_shared_experts_stream=use_shared_experts_stream, ) diff --git a/vllm/model_executor/layers/fused_moe/prepare_finalize.py b/vllm/model_executor/layers/fused_moe/prepare_finalize.py index 9bb976fb9ec9..e27e2eb32da0 100644 --- a/vllm/model_executor/layers/fused_moe/prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/prepare_finalize.py @@ -45,7 +45,8 @@ def prepare( assert topk == 1, ( "apply_router_weight_on_input is only implemented for topk=1" ) - a1.mul_(topk_weights.to(a1.dtype)) + # Note: do not use inplace for shared experts overlap + a1 = a1 * topk_weights.to(a1.dtype) a1q, a1q_scale = moe_kernel_quantize_input( a1, From 9875be6431872b513a8554c518e48ad79eba4656 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 21 Nov 2025 09:46:43 +0800 Subject: [PATCH 261/578] [LoRA][2/2]Remove LoRA extra vocab (#28545) Signed-off-by: Jee Jee Li --- tests/lora/conftest.py | 10 + tests/lora/test_layers.py | 189 ++----------------- tests/lora/test_llama_tp.py | 84 +++++---- tests/lora/test_lora_functions.py | 4 +- tests/lora/test_lora_manager.py | 20 +- tests/lora/test_worker.py | 8 +- tests/lora/utils.py | 8 - vllm/config/lora.py | 18 +- vllm/engine/arg_utils.py | 5 - vllm/lora/layers/base.py | 1 - vllm/lora/layers/base_linear.py | 1 - vllm/lora/layers/column_parallel_linear.py | 1 - vllm/lora/layers/fused_moe.py | 2 - vllm/lora/layers/logits_processor.py | 55 +----- vllm/lora/layers/vocal_parallel_embedding.py | 33 +--- 
vllm/lora/lora_weights.py | 24 --- vllm/lora/models.py | 54 +----- vllm/lora/punica_wrapper/punica_base.py | 11 +- vllm/lora/punica_wrapper/punica_gpu.py | 5 +- vllm/lora/punica_wrapper/punica_tpu.py | 3 +- vllm/lora/punica_wrapper/punica_xpu.py | 5 +- vllm/lora/utils.py | 10 + vllm/lora/worker_manager.py | 9 +- vllm/model_executor/models/granite.py | 34 +--- vllm/model_executor/models/llama.py | 30 +-- vllm/model_executor/models/mixtral.py | 32 +--- vllm/model_executor/models/teleflm.py | 2 +- vllm/v1/worker/tpu_model_runner.py | 3 - 28 files changed, 133 insertions(+), 528 deletions(-) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index d8ff9339bb49..9d38ec542279 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -250,6 +250,16 @@ def olmoe_lora_files(): return snapshot_download(repo_id="jeeejeee/olmoe-instruct-text2sql-spider") +@pytest.fixture(scope="session") +def qwen3_lora_files(): + return snapshot_download(repo_id="charent/self_cognition_Alice") + + +@pytest.fixture(scope="session") +def llama32_lora_files(): + return snapshot_download(repo_id="jeeejeee/llama32-3b-text2sql-spider") + + @pytest.fixture def reset_default_device(): """ diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 8f18f0144193..9df3a07a9e5e 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -136,7 +136,6 @@ def populate_loras( id_to_index: list[int | None], layer: BaseLayerWithLoRA, layer_weights: torch.Tensor, - generate_embeddings_tensor: int = 0, repeats: int = 1, ) -> tuple[dict[int, LoRALayerWeights], dict[int, list[LoRALayerWeights]]]: """This method populates the lora layers with lora weights. @@ -148,8 +147,6 @@ def populate_loras( layer: the LoRAlayer to populate. layer_weights: the PyTorch tensor containing the layer's weights. - generate_embeddings_tensor: whether to generate an - embeddings tensor for each LoRA. repeats: must only be set for column parallel packed layers. Indicates the number of loras to compose together to create a single lora layer. 
@@ -171,7 +168,6 @@ def populate_loras( sublora = DummyLoRAManager(layer_weights.device).init_random_lora( module_name=f"fake_{i}", weight=layer_weights, - generate_embeddings_tensor=generate_embeddings_tensor, ) sublora.lora_b = sublora.lora_b[ (sublora_len * i) : (sublora_len * (i + 1)), : @@ -185,7 +181,6 @@ def populate_loras( slot_idx, lora_a=lora.lora_a, lora_b=lora.lora_b, - embeddings_tensor=lora.embeddings_tensor, ) lora_dict[lora_id] = lora @@ -306,7 +301,6 @@ def create_random_embedding_layer(): id_to_index, max_loras, vocab_size, - lora_config.lora_extra_vocab_size, ) lora_result = lora_embedding(torch.cat(inputs)) @@ -344,7 +338,6 @@ def create_random_embedding_layer(): id_to_index, max_loras, vocab_size, - lora_config.lora_extra_vocab_size, ) lora_result = lora_embedding(torch.cat(inputs)) @@ -354,149 +347,6 @@ def create_random_embedding_layer(): torch.testing.assert_close(lora_result, expected_result, rtol=rtol, atol=atol) -@torch.inference_mode() -# @pytest.mark.skip( -# reason="Fails when loras are in any slot other than the first.") -@pytest.mark.parametrize("num_loras", [1, 2, 4]) -@pytest.mark.parametrize("device", DEVICES) -@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000]) -@pytest.mark.parametrize("stage", STAGES) -def test_embeddings_with_new_embeddings( - dist_init, num_loras, device, vocab_size, stage -) -> None: - if current_platform.is_cuda_alike(): - torch.cuda.set_device(device) - - torch.set_default_device(device) - max_loras = 8 - punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras) - assert check_punica_wrapper(punica_wrapper) - lora_config = LoRAConfig( - max_loras=max_loras, max_lora_rank=8, lora_dtype=torch.float16 - ) - - def create_random_embedding_layer(): - embedding = VocabParallelEmbedding(vocab_size, 256) - embedding_data = torch.rand_like(embedding.weight.data) - embedding.weight.data = embedding_data - embedding.weight.data[vocab_size:, :] = 0 - expanded_embedding = VocabParallelEmbedding( - vocab_size + lora_config.lora_extra_vocab_size * max_loras, - 256, - org_num_embeddings=vocab_size, - ) - expanded_embedding.weight.data[:vocab_size, :] = embedding_data - # We need to deepcopy the embedding as it will be modified - # in place - lora_embedding = VocabParallelEmbeddingWithLoRA(deepcopy(expanded_embedding)) - lora_embedding.create_lora_weights(max_loras, lora_config) - - return expanded_embedding, lora_embedding - - for i in range(NUM_RANDOM_SEEDS): - set_random_seed(i) - - id_to_index = get_random_id_to_index(num_loras, max_loras) - expanded_embedding, lora_embedding = create_random_embedding_layer() - lora_dict, _ = populate_loras( - id_to_index, - layer=lora_embedding, - layer_weights=torch.zeros( - (256, vocab_size + lora_config.lora_extra_vocab_size) - ), - generate_embeddings_tensor=256, - ) - - lora_embedding.set_mapping(punica_wrapper) - # All embeddings tensors have the same shape. - embeddings_tensors = [ - lora_dict[id].embeddings_tensor for id in sorted(lora_dict.keys()) - ] - embeddings_tensor_len = embeddings_tensors[0].shape[0] - - # Add empty embeddings_tensors for unoccupied lora slots. 
- for _ in range(max_loras - len(embeddings_tensors)): - embeddings_tensors.append(torch.zeros(embeddings_tensors[0].shape)) - - inputs, index_mapping, prompt_mapping = create_random_inputs( - active_lora_ids=list(lora_dict.keys()), - num_inputs=num_loras * 3, - input_size=(200,), - input_range=(1, vocab_size), - device=device, - ) - lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) - punica_wrapper.update_metadata( - lora_mapping, - id_to_index, - max_loras, - vocab_size, - lora_config.lora_extra_vocab_size, - ) - original_inputs = deepcopy(inputs) - - # Force some of the inputs to be in the extended embeddings range - # to guarantee that their behavior is tested. - for input_, original_input_, lora_id in zip( - inputs, original_inputs, prompt_mapping - ): - embedding_id = lora_id - 1 - input_[-1] = vocab_size + (embedding_id * embeddings_tensor_len) - original_input_[-1] = vocab_size - input_[-2] = vocab_size + ((embedding_id + 1) * embeddings_tensor_len - 1) - original_input_[-2] = vocab_size + embeddings_tensor_len - 1 - - expanded_embedding.weight[ - vocab_size : vocab_size + (embeddings_tensor_len * max_loras) - ] = torch.cat(embeddings_tensors) - - lora_result = lora_embedding(torch.cat(original_inputs)) - - expected_results: list[torch.Tensor] = [] - for input_, original_input_, lora_id in zip( - inputs, original_inputs, prompt_mapping - ): - lora = lora_dict[lora_id] - result = expanded_embedding(input_) - after_a = F.embedding( - original_input_, - lora.lora_a.T, - ) - result += after_a @ lora.lora_b.T - expected_results.append(result) - expected_result = torch.cat(expected_results) - - rtol, atol = TOLERANCES[lora_result.dtype] - torch.testing.assert_close(lora_result, expected_result, rtol=rtol, atol=atol) - - # Check that resetting the lora weights succeeds - - for slot_idx in range(max_loras): - lora_embedding.reset_lora(slot_idx) - - inputs, index_mapping, prompt_mapping = create_random_inputs( - active_lora_ids=[0], - num_inputs=num_loras * 3, - input_size=(200,), - input_range=(1, vocab_size), - device=device, - ) - original_inputs = deepcopy(inputs) - lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) - punica_wrapper.update_metadata( - lora_mapping, - id_to_index, - max_loras, - vocab_size, - lora_config.lora_extra_vocab_size, - ) - lora_result = lora_embedding(torch.cat(original_inputs)) - expected_result = expanded_embedding(torch.cat(inputs)) - - rtol, atol = TOLERANCES[lora_result.dtype] - torch.testing.assert_close(lora_result, expected_result, rtol=rtol, atol=atol) - - @torch.inference_mode() @pytest.mark.parametrize("num_loras", [1, 2, 4]) @pytest.mark.parametrize("device", DEVICES) @@ -518,16 +368,13 @@ def test_lm_head_logits_processor( def _pretest(): linear = ParallelLMHead( - vocab_size + lora_config.lora_extra_vocab_size, - 1024, - vocab_size, + num_embeddings=vocab_size, + embedding_dim=1024, params_dtype=torch.float16, ) linear.weight.data = torch.rand_like(linear.weight.data) linear.weight.data[:, vocab_size:] = 0 - logits_processor = LogitsProcessor( - vocab_size + lora_config.lora_extra_vocab_size, vocab_size - ) + logits_processor = LogitsProcessor(vocab_size) lora_logits_processor = LogitsProcessorWithLoRA( logits_processor, 1024, linear.weight.dtype, linear.weight.device, None ) @@ -541,15 +388,12 @@ def _pretest(): id_to_index = get_random_id_to_index(num_loras, max_loras) linear, logits_processor, lora_logits_processor = _pretest() lora_logits_processor.set_mapping(punica_wrapper) - # NOTE: all the 
generated loras share the same embeddings tensor. + lora_dict, _ = populate_loras( id_to_index, layer=lora_logits_processor, layer_weights=linear.weight, - generate_embeddings_tensor=1024, ) - embeddings_tensor = list(lora_dict.values())[0].embeddings_tensor - embeddings_tensor_len = embeddings_tensor.shape[0] inputs, index_mapping, prompt_mapping = create_random_inputs( active_lora_ids=list(lora_dict.keys()), @@ -565,7 +409,6 @@ def _pretest(): id_to_index, max_loras, vocab_size, - lora_config.lora_extra_vocab_size, ) input_ = torch.rand(20, 1024) @@ -575,23 +418,16 @@ def _pretest(): original_lm_head = deepcopy(linear) - linear.weight[ - logits_processor.org_vocab_size : logits_processor.org_vocab_size - + embeddings_tensor_len - ] = embeddings_tensor - - logits_processor.org_vocab_size = vocab_size + lora_config.lora_extra_vocab_size expected_results: list[torch.Tensor] = [] for input_, lora_id in zip(inputs, prompt_mapping): lora = lora_dict[lora_id] result = logits_processor._get_logits( hidden_states=input_, lm_head=linear, embedding_bias=None ) - result[:, vocab_size + embeddings_tensor_len :] = float("-inf") + result += input_ @ lora.lora_a.T @ lora.lora_b.T * lora.scaling expected_results.append(result) expected_result = torch.cat(expected_results) - logits_processor.org_vocab_size = vocab_size # Check that resetting the lora weights succeeds @@ -612,7 +448,6 @@ def _pretest(): id_to_index, max_loras, vocab_size, - lora_config.lora_extra_vocab_size, ) lora_result = lora_logits_processor._get_logits( @@ -694,7 +529,6 @@ def create_random_linear_replicated_layer(): id_to_index, max_loras, 512, - lora_config.lora_extra_vocab_size, ) lora_result = lora_linear(torch.cat(inputs))[0] @@ -726,7 +560,10 @@ def create_random_linear_replicated_layer(): lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) punica_wrapper.update_metadata( - lora_mapping, id_to_index, max_loras, 512, lora_config.lora_extra_vocab_size + lora_mapping, + id_to_index, + max_loras, + 512, ) lora_result = lora_linear(torch.cat(inputs))[0] @@ -817,7 +654,6 @@ def create_random_linear_parallel_layer(): id_to_index, max_loras, 512, - lora_config.lora_extra_vocab_size, ) lora_result = lora_linear(torch.cat(inputs))[0] @@ -849,7 +685,10 @@ def create_random_linear_parallel_layer(): lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) punica_wrapper.update_metadata( - lora_mapping, id_to_index, max_loras, 512, lora_config.lora_extra_vocab_size + lora_mapping, + id_to_index, + max_loras, + 512, ) lora_result = lora_linear(torch.cat(inputs))[0] @@ -963,7 +802,6 @@ class FakeConfig: id_to_index, max_loras, 512, - lora_config.lora_extra_vocab_size, ) lora_result = lora_linear(torch.cat(inputs))[0] @@ -1000,7 +838,6 @@ class FakeConfig: id_to_index, max_loras, 512, - lora_config.lora_extra_vocab_size, ) lora_result = lora_linear(torch.cat(inputs))[0] diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index 7bbd1e364d19..18704fa6e45d 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -13,17 +13,27 @@ from ..utils import VLLM_PATH, create_new_process_for_each_test, multi_gpu_test -MODEL_PATH = "meta-llama/Llama-2-7b-hf" +PROMPT_TEMPLATE = """<|eot_id|><|start_header_id|>user<|end_header_id|> +I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request. 
+" +##Instruction: +candidate_poll contains tables such as candidate, people. Table candidate has columns such as Candidate_ID, People_ID, Poll_Source, Date, Support_rate, Consider_rate, Oppose_rate, Unsure_rate. Candidate_ID is the primary key. +Table people has columns such as People_ID, Sex, Name, Date_of_Birth, Height, Weight. People_ID is the primary key. +The People_ID of candidate is the foreign key of People_ID of people. +###Input: +{context} +###Response:<|eot_id|><|start_header_id|>assistant<|end_header_id|> +""" # noqa: E501 EXPECTED_LORA_OUTPUT = [ - " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 - " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", - " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", # noqa: E501 - " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", # noqa: E501 - " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", - " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' ", # noqa: E501 + "SELECT count(*) FROM candidate", + "SELECT count(*) FROM candidate", + "SELECT poll_source FROM candidate GROUP BY poll_source ORDER BY count(*) DESC LIMIT 1", # noqa: E501 + "SELECT poll_source FROM candidate GROUP BY poll_source ORDER BY count(*) DESC LIMIT 1", # noqa: E501 ] +MODEL_PATH = "meta-llama/Llama-3.2-3B-Instruct" + def do_sample( llm: vllm.LLM, @@ -32,18 +42,19 @@ def do_sample( tensorizer_config_dict: dict | None = None, ) -> list[str]: prompts = [ - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? 
[/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]", # noqa: E501 + PROMPT_TEMPLATE.format(context="How many candidates are there?"), + PROMPT_TEMPLATE.format(context="Count the number of candidates."), + PROMPT_TEMPLATE.format( + context="Which poll resource provided the most number of candidate information?" # noqa: E501 + ), + PROMPT_TEMPLATE.format( + context="Return the poll resource associated with the most candidates." + ), ] sampling_params = vllm.SamplingParams( - temperature=0, max_tokens=256, skip_special_tokens=False, stop=["[/assistant]"] + temperature=0, max_tokens=64, stop=["<|im_end|>"] ) - if tensorizer_config_dict is not None: outputs = llm.generate( prompts, @@ -75,13 +86,15 @@ def do_sample( return generated_texts -def generate_and_test(llm, sql_lora_files, tensorizer_config_dict: dict | None = None): +def generate_and_test( + llm, llama32_lora_files, tensorizer_config_dict: dict | None = None +): print("lora adapter created") print("lora 1") assert ( do_sample( llm, - sql_lora_files, + llama32_lora_files, tensorizer_config_dict=tensorizer_config_dict, lora_id=1, ) @@ -92,7 +105,7 @@ def generate_and_test(llm, sql_lora_files, tensorizer_config_dict: dict | None = assert ( do_sample( llm, - sql_lora_files, + llama32_lora_files, tensorizer_config_dict=tensorizer_config_dict, lora_id=2, ) @@ -104,51 +117,52 @@ def generate_and_test(llm, sql_lora_files, tensorizer_config_dict: dict | None = @create_new_process_for_each_test() @pytest.mark.parametrize("cudagraph_specialize_lora", [True, False]) -def test_llama_lora(sql_lora_files, cudagraph_specialize_lora: bool): +def test_llama_lora(llama32_lora_files, cudagraph_specialize_lora: bool): llm = vllm.LLM( MODEL_PATH, - tokenizer=sql_lora_files, enable_lora=True, # also test odd max_num_seqs - max_num_seqs=13, + max_num_seqs=7, + max_model_len=1024, max_loras=4, compilation_config=vllm.config.CompilationConfig( cudagraph_specialize_lora=cudagraph_specialize_lora, ), ) - generate_and_test(llm, sql_lora_files) + generate_and_test(llm, llama32_lora_files) @multi_gpu_test(num_gpus=4) -def test_llama_lora_tp4(sql_lora_files): +def test_llama_lora_tp4(llama32_lora_files): llm = vllm.LLM( MODEL_PATH, - tokenizer=sql_lora_files, enable_lora=True, - max_num_seqs=16, + max_num_seqs=7, + max_model_len=1024, max_loras=4, tensor_parallel_size=4, ) - generate_and_test(llm, sql_lora_files) + generate_and_test(llm, llama32_lora_files) @multi_gpu_test(num_gpus=4) -def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files): +def test_llama_lora_tp4_fully_sharded_loras(llama32_lora_files): llm = vllm.LLM( MODEL_PATH, - tokenizer=sql_lora_files, enable_lora=True, - max_num_seqs=16, + max_num_seqs=8, max_loras=4, + max_model_len=1024, tensor_parallel_size=4, fully_sharded_loras=True, ) - generate_and_test(llm, sql_lora_files) + generate_and_test(llm, llama32_lora_files) @multi_gpu_test(num_gpus=2) def test_tp2_serialize_and_deserialize_lora( - tmp_path, sql_lora_files, sql_lora_huggingface_id + tmp_path, + llama32_lora_files, ): # Run the tensorizing of the LoRA adapter and the model in a subprocess # to guarantee cleanup @@ -157,7 +171,7 @@ def test_tp2_serialize_and_deserialize_lora( model_name = "model-rank-%03d.tensors" model_ref = MODEL_PATH - lora_path = sql_lora_huggingface_id + lora_path = llama32_lora_files 
suffix = "test" try: result = subprocess.run( @@ -195,12 +209,12 @@ def test_tp2_serialize_and_deserialize_lora( loaded_llm = LLM( model=model_ref, - tokenizer=sql_lora_files, load_format="tensorizer", enable_lora=True, enforce_eager=True, model_loader_extra_config=tensorizer_config, - max_num_seqs=13, + max_num_seqs=7, + max_model_len=1024, tensor_parallel_size=2, max_loras=2, ) @@ -211,7 +225,7 @@ def test_tp2_serialize_and_deserialize_lora( print("lora 1") assert ( do_sample( - loaded_llm, sql_lora_files, tensorizer_config_dict=tc_as_dict, lora_id=1 + loaded_llm, llama32_lora_files, tensorizer_config_dict=tc_as_dict, lora_id=1 ) == EXPECTED_LORA_OUTPUT ) diff --git a/tests/lora/test_lora_functions.py b/tests/lora/test_lora_functions.py index e914393fee8a..1c692630284d 100644 --- a/tests/lora/test_lora_functions.py +++ b/tests/lora/test_lora_functions.py @@ -13,8 +13,8 @@ from vllm.lora.request import LoRARequest from vllm.v1.engine.llm_engine import LLMEngine -MODEL_PATH = "meta-llama/Llama-2-7b-hf" -LORA_MODULE_PATH = "yard1/llama-2-7b-sql-lora-test" +MODEL_PATH = "Qwen/Qwen3-0.6B" +LORA_MODULE_PATH = "charent/self_cognition_Alice" LORA_RANK = 8 diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index e7816031142e..24d4dfca46d6 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -48,9 +48,6 @@ @pytest.mark.parametrize("device", DEVICES) def test_from_lora_tensors(sql_lora_files, device): tensors = load_file(os.path.join(sql_lora_files, "adapter_model.safetensors")) - new_embeddings = load_file( - os.path.join(sql_lora_files, "new_embeddings.safetensors") - ) peft_helper = PEFTHelper.from_local_dir( sql_lora_files, max_position_embeddings=4096 @@ -60,7 +57,6 @@ def test_from_lora_tensors(sql_lora_files, device): tensors, peft_helper=peft_helper, device=device, - embeddings=new_embeddings, embedding_modules=EMBEDDING_MODULES, embedding_padding_modules=EMBEDDING_PADDING_MODULES, ) @@ -76,18 +72,6 @@ def test_from_lora_tensors(sql_lora_files, device): f"{lora.lora_a.shape=}, {lora.lora_b.shape=}" ) assert lora.lora_a.shape[0] == 8 - embeddings_module = next( - (k for k in EMBEDDING_MODULES if k in module_name), None - ) - if embeddings_module: - assert torch.equal( - lora.embeddings_tensor, - new_embeddings[EMBEDDING_MODULES[embeddings_module]].to( - device=lora.embeddings_tensor.device - ), - ) - else: - assert lora.embeddings_tensor is None def create_lora( @@ -552,9 +536,7 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path worker_adapter_manager = WorkerLoRAManager( vllm_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES ) - worker_adapter_manager.vocab_size = ( - dummy_model_gate_up.unpadded_vocab_size - lora_config.lora_extra_vocab_size - ) + worker_adapter_manager.vocab_size = dummy_model_gate_up.unpadded_vocab_size worker_adapter_manager.create_lora_manager(dummy_model_gate_up) dummy_lora_files = f"{tmp_path}/lora_adapter" diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index c97f8debd1b9..b163559a9414 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -20,11 +20,12 @@ from vllm.lora.request import LoRARequest from vllm.v1.worker.gpu_worker import Worker +MODEL_PATH = "Qwen/Qwen3-0.6B" NUM_LORAS = 16 @patch.dict(os.environ, {"RANK": "0"}) -def test_worker_apply_lora(sql_lora_files): +def test_worker_apply_lora(qwen3_lora_files): def set_active_loras(worker: Worker, lora_requests: list[LoRARequest]): lora_mapping = LoRAMapping([], []) @@ 
-34,9 +35,10 @@ def set_active_loras(worker: Worker, lora_requests: list[LoRARequest]): vllm_config = VllmConfig( model_config=ModelConfig( - "meta-llama/Llama-2-7b-hf", + MODEL_PATH, seed=0, dtype="float16", + max_model_len=127, enforce_eager=True, ), load_config=LoadConfig( @@ -73,7 +75,7 @@ def set_active_loras(worker: Worker, lora_requests: list[LoRARequest]): assert worker.list_loras() == set() lora_requests = [ - LoRARequest(str(i + 1), i + 1, sql_lora_files) for i in range(NUM_LORAS) + LoRARequest(str(i + 1), i + 1, qwen3_lora_files) for i in range(NUM_LORAS) ] set_active_loras(worker, lora_requests) diff --git a/tests/lora/utils.py b/tests/lora/utils.py index d30b77f09466..6aba5299b582 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -28,7 +28,6 @@ def init_random_lora( module_name: str, weight: torch.Tensor, rank: int = 8, - generate_embeddings_tensor: int = 0, ): lora = LoRALayerWeights( module_name, @@ -41,13 +40,6 @@ def init_random_lora( [weight.shape[0], rank], dtype=weight.dtype, device=self._device ), ) - if generate_embeddings_tensor: - lora.embeddings_tensor = torch.rand( - 5, - generate_embeddings_tensor, - dtype=weight.dtype, - device=self._device, - ) self.set_module_lora(module_name, lora) return lora diff --git a/vllm/config/lora.py b/vllm/config/lora.py index 84e92eef4007..072e0ec2104f 100644 --- a/vllm/config/lora.py +++ b/vllm/config/lora.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import hashlib -from typing import TYPE_CHECKING, Any, ClassVar, Literal +from typing import TYPE_CHECKING, Any, Literal import torch from pydantic import ConfigDict, Field, model_validator @@ -11,7 +11,6 @@ from vllm.config.utils import config from vllm.logger import init_logger -from vllm.platforms import current_platform if TYPE_CHECKING: from vllm.config import ModelConfig @@ -46,19 +45,6 @@ class LoRAConfig: `max_loras`.""" lora_dtype: torch.dtype | LoRADType = "auto" """Data type for LoRA. If auto, will default to base model dtype.""" - lora_extra_vocab_size: LoRAExtraVocabSize = Field( - default=256, - deprecated=( - "`lora_extra_vocab_size` is deprecated and will be removed " - "in v0.12.0. Additional vocabulary support for " - "LoRA adapters is being phased out." - ), - ) - """(Deprecated) Maximum size of extra vocabulary that can be present in a - LoRA adapter. 
Will be removed in v0.12.0.""" - lora_vocab_padding_size: ClassVar[int] = ( - current_platform.get_lora_vocab_padding_size() - ) default_mm_loras: dict[str, str] | None = None """Dictionary mapping specific modalities to LoRA model paths; this field is only applicable to multimodal models and should be leveraged when a @@ -87,8 +73,6 @@ def compute_hash(self) -> str: factors.append(self.max_loras) factors.append(self.fully_sharded_loras) factors.append(self.lora_dtype) - factors.append(self.lora_extra_vocab_size) - factors.append(self.lora_vocab_padding_size) hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() return hash_str diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 74828bc109cb..bcb90119f9b0 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -484,7 +484,6 @@ class EngineArgs: fully_sharded_loras: bool = LoRAConfig.fully_sharded_loras max_cpu_loras: int | None = LoRAConfig.max_cpu_loras lora_dtype: str | torch.dtype | None = LoRAConfig.lora_dtype - lora_extra_vocab_size: int = LoRAConfig.lora_extra_vocab_size ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight num_gpu_blocks_override: int | None = CacheConfig.num_gpu_blocks_override @@ -1011,9 +1010,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: ) lora_group.add_argument("--max-loras", **lora_kwargs["max_loras"]) lora_group.add_argument("--max-lora-rank", **lora_kwargs["max_lora_rank"]) - lora_group.add_argument( - "--lora-extra-vocab-size", **lora_kwargs["lora_extra_vocab_size"] - ) lora_group.add_argument( "--lora-dtype", **lora_kwargs["lora_dtype"], @@ -1680,7 +1676,6 @@ def create_engine_config( max_loras=self.max_loras, default_mm_loras=self.default_mm_loras, fully_sharded_loras=self.fully_sharded_loras, - lora_extra_vocab_size=self.lora_extra_vocab_size, lora_dtype=self.lora_dtype, max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras and self.max_cpu_loras > 0 diff --git a/vllm/lora/layers/base.py b/vllm/lora/layers/base.py index 0c7e80684889..62326c05b2bd 100644 --- a/vllm/lora/layers/base.py +++ b/vllm/lora/layers/base.py @@ -44,7 +44,6 @@ def set_lora( index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, - embeddings_tensor: torch.Tensor | None, ): """Overwrites lora tensors at index.""" ... 
diff --git a/vllm/lora/layers/base_linear.py b/vllm/lora/layers/base_linear.py index 3db4165e2017..e85c5bd70b07 100644 --- a/vllm/lora/layers/base_linear.py +++ b/vllm/lora/layers/base_linear.py @@ -96,7 +96,6 @@ def set_lora( index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, - embeddings_tensor: torch.Tensor | None, ): # Except for QKVParallelLinearWithLoRA and # MergedColumnParallelLinearWithLoRA, all other linear LoRA layers diff --git a/vllm/lora/layers/column_parallel_linear.py b/vllm/lora/layers/column_parallel_linear.py index 637ded9b2a0f..273c4950e323 100644 --- a/vllm/lora/layers/column_parallel_linear.py +++ b/vllm/lora/layers/column_parallel_linear.py @@ -248,7 +248,6 @@ def set_lora( index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, - embeddings_tensor: torch.Tensor | None, ): self.reset_lora(index) diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index 3291c41fcda1..adf30855cafc 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -406,8 +406,6 @@ def set_lora( index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, - embeddings_tensor: torch.Tensor | None, - bias: torch.Tensor | None = None, ): """Overwrites lora tensors at index.""" self.reset_lora(index) diff --git a/vllm/lora/layers/logits_processor.py b/vllm/lora/layers/logits_processor.py index adc5e861f57f..06f92652031e 100644 --- a/vllm/lora/layers/logits_processor.py +++ b/vllm/lora/layers/logits_processor.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import math import torch import torch.nn as nn @@ -108,22 +107,13 @@ def create_lora_weights( ( max_loras, 1, - # Pad for kernel compatibility - math.ceil( - self.base_layer.vocab_size / lora_config.lora_vocab_padding_size - ) - * lora_config.lora_vocab_padding_size, + self.base_layer.vocab_size, lora_config.max_lora_rank, ), dtype=lora_config.lora_dtype, device=self.device, ) - self.embeddings_tensors = torch.full( - (max_loras, lora_config.lora_extra_vocab_size, self.hidden_size), - fill_value=float("-inf"), - dtype=self.dtype, - device=self.device, - ) + if self.sharded_to_full_mapping is not None: self.sharded_to_full_mapping_gpu = torch.tensor( self.sharded_to_full_mapping, device=self.device, dtype=torch.long @@ -134,14 +124,12 @@ def create_lora_weights( def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 self.lora_b_stacked[index] = 0 - self.embeddings_tensors[index] = float("-inf") def set_lora( self, index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, - embeddings_tensor: torch.Tensor | None, ): self.reset_lora(index) self.lora_a_stacked[index, 0, : lora_a.shape[0], : lora_a.shape[1]].copy_( @@ -150,12 +138,6 @@ def set_lora( self.lora_b_stacked[index, 0, : lora_b.shape[0], : lora_b.shape[1]].copy_( lora_b, non_blocking=True ) - if embeddings_tensor is not None: - self.embeddings_tensors[ - index, - : embeddings_tensor.shape[0], - : embeddings_tensor.shape[1], - ] = embeddings_tensor def _get_logits( self, @@ -193,39 +175,6 @@ def _get_logits( # token_id: [0, 1, 2, 3, 4, 5, -1, -1] logits = logits[:, self.sharded_to_full_mapping_gpu] - lora_logits = torch.empty( - self.embeddings_tensors.shape[0] + 1, - self.embeddings_tensors.shape[1], - hidden_states.shape[0], - dtype=self.embeddings_tensors.dtype, - device=self.embeddings_tensors.device, - ) - torch.matmul(self.embeddings_tensors, hidden_states.T, out=lora_logits[:-1]) - - neg_inf, pos_inf = current_platform.get_infinity_values(lora_logits.dtype) 
- - lora_logits[-1] = neg_inf - lora_logits = lora_logits.mT - indices_padded = self.punica_wrapper.sampler_indices_padded - - if current_platform.is_tpu() or current_platform.is_xpu(): - indices_padded = indices_padded[: logits.size(0)] - - lora_logits = ( - lora_logits.reshape( - lora_logits.shape[0] * lora_logits.shape[1], - lora_logits.shape[2], - ) - .index_select(0, indices_padded) - .nan_to_num_(nan=neg_inf, posinf=pos_inf, neginf=neg_inf) - ) - - logits[ - :, - self.base_layer.org_vocab_size : self.base_layer.org_vocab_size - + lora_logits.shape[1], - ] = lora_logits - lora_output: torch.Tensor | None = self.punica_wrapper.add_lora_logits( logits, hidden_states, self.lora_a_stacked, self.lora_b_stacked, 1.0 ) diff --git a/vllm/lora/layers/vocal_parallel_embedding.py b/vllm/lora/layers/vocal_parallel_embedding.py index ca4ad8012e9c..5b1f7886bc23 100644 --- a/vllm/lora/layers/vocal_parallel_embedding.py +++ b/vllm/lora/layers/vocal_parallel_embedding.py @@ -46,19 +46,10 @@ def create_lora_weights( self.embeddings_slice = None self.embeddings_weights = None - self.embeddings_tensors = torch.zeros( - ( - max_loras, - lora_config.lora_extra_vocab_size, - self.base_layer.embedding_dim, - ), - dtype=self.base_layer.weight.dtype, - device=self.base_layer.weight.device, - ) self.lora_a_stacked = torch.zeros( ( max_loras, - self.base_layer.org_vocab_size + lora_config.lora_extra_vocab_size, + self.base_layer.org_vocab_size, lora_config.max_lora_rank, ), dtype=lora_config.lora_dtype, @@ -82,14 +73,12 @@ def create_lora_weights( def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 self.lora_b_stacked[index] = 0 - self.embeddings_tensors[index] = 0 def set_lora( self, index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, - embeddings_tensor: torch.Tensor | None, ): self.reset_lora(index) # NOTE self.lora_a_stacked is row-major, and lora_a is col-major, @@ -100,36 +89,18 @@ def set_lora( self.lora_b_stacked[index, 0, : lora_b.shape[0], : lora_b.shape[1]].copy_( lora_b, non_blocking=True ) - if embeddings_tensor is not None: - self.embeddings_tensors[ - index, - : embeddings_tensor.shape[0], - : embeddings_tensor.shape[1], - ].copy_(embeddings_tensor, non_blocking=True) - if self.embeddings_slice is not None: - # TODO(yard1): Optimize this copy, we don't need to copy - # everything, just the modified part - embeddings = self.embeddings_tensors.view( - self.embeddings_tensors.shape[0] * self.embeddings_tensors.shape[1], - self.embeddings_tensors.shape[2], - )[self.embeddings_slice[0] : self.embeddings_slice[1]] - assert self.embeddings_weights is not None - self.embeddings_weights[: embeddings.shape[0]].copy_(embeddings) def forward(self, x: torch.Tensor) -> torch.Tensor: - added_tokens_mask = torch.where(x > self.base_layer.org_vocab_size - 1, 1, 0) - # NB: Don't use torch.narrow here. 
torch.narrow triggers some # Dynamic Shape specialization in torch.compile num_tokens = x.shape[0] indices_1 = self.punica_wrapper._embeddings_indices[1][:num_tokens] - indices_0 = self.punica_wrapper._embeddings_indices[0][:num_tokens] full_lora_a_embeddings = F.embedding( x + indices_1, self.lora_a_stacked_2d, ) - full_output = self.base_layer.forward(x + (indices_0 * added_tokens_mask)) + full_output = self.base_layer.forward(x) full_output_org = full_output if full_output.ndim == 3: diff --git a/vllm/lora/lora_weights.py b/vllm/lora/lora_weights.py index 7691481d5039..f0d8e2219405 100644 --- a/vllm/lora/lora_weights.py +++ b/vllm/lora/lora_weights.py @@ -21,7 +21,6 @@ def __init__( lora_alpha: int, lora_a: torch.Tensor, lora_b: torch.Tensor, - embeddings_tensor: torch.Tensor | None = None, scaling: float | None = None, ) -> None: self.module_name = module_name @@ -29,7 +28,6 @@ def __init__( self.lora_alpha = lora_alpha self.lora_a = lora_a self.lora_b = lora_b - self.embeddings_tensor = embeddings_tensor if scaling is None: self.scaling = self.lora_alpha / self.rank @@ -56,18 +54,11 @@ def output_dim(self) -> int: def is_packed(self) -> bool: return False - @property - def extra_vocab_size(self) -> int: - return ( - self.embeddings_tensor.shape[0] if self.embeddings_tensor is not None else 0 - ) - @classmethod def from_config( cls, module_name: str, peft_helper: PEFTHelper, - embeddings_tensor: torch.Tensor | None = None, ) -> "LoRALayerWeights": # lora_a and lora_b are set to None for config-based construction return cls( @@ -76,7 +67,6 @@ def from_config( peft_helper.lora_alpha, None, None, - embeddings_tensor, peft_helper.vllm_lora_scaling_factor, ) @@ -89,7 +79,6 @@ def create_dummy_lora_weights( rank: int, dtype: torch.dtype, device: torch.types.Device, - embeddings_tensor_dim: int | None = None, ) -> "LoRALayerWeights": pin_memory = str(device) == "cpu" and is_pin_memory_available() lora_a = torch.zeros( @@ -99,24 +88,12 @@ def create_dummy_lora_weights( [output_dim, rank], dtype=dtype, device=device, pin_memory=pin_memory ) - embeddings_tensor = ( - torch.rand( - 10, - embeddings_tensor_dim, - dtype=dtype, - device=device, - pin_memory=pin_memory, - ) - if embeddings_tensor_dim - else None - ) return cls( module_name, rank=rank, lora_alpha=1, lora_a=lora_a, lora_b=lora_b, - embeddings_tensor=embeddings_tensor, ) @@ -139,7 +116,6 @@ def __init__( lora_a=lora_a, lora_b=lora_b, scaling=scaling, # type: ignore - embeddings_tensor=None, ) self.lora_alphas = lora_alphas if scaling is None: diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 02c252f15bfa..eb11cd0afc48 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -21,6 +21,7 @@ from_layer, from_layer_logits_processor, get_supported_lora_modules, + is_base_embeddding_weights, is_regex_target_modules, parse_fine_tuned_lora_name, process_packed_modules_mapping, @@ -93,14 +94,6 @@ def clone(self, lora_model_id: int) -> "LoRAModel": loras=self.loras.copy(), ) - @property - def extra_vocab_size(self) -> int: - return ( - max(lora.extra_vocab_size for lora in self.loras.values()) - if self.loras - else 0 - ) - def get_lora(self, module_name: str) -> LoRALayerWeights | None: """Get LoRA for a given module by name""" return self.loras.get(module_name, None) @@ -117,7 +110,6 @@ def from_lora_tensors( peft_helper: PEFTHelper, device: str = "cuda", dtype: torch.dtype | None = None, - embeddings: dict[str, torch.Tensor] | None = None, target_embedding_padding: int | None = None, embedding_modules: dict[str, str] | None = 
None, embedding_padding_modules: list[str] | None = None, @@ -127,24 +119,14 @@ def from_lora_tensors( pin_memory = str(device) == "cpu" and is_pin_memory_available() loras: dict[str, LoRALayerWeights] = {} for tensor_name, tensor in tensors.items(): + if is_base_embeddding_weights(tensor_name): + continue module_name, is_lora_a = parse_fine_tuned_lora_name( tensor_name, weights_mapper ) if module_name not in loras: - lora_embeddings_tensor = None - if embeddings: - assert embedding_modules is not None - embeddings_module = next( - (k for k in embedding_modules if k in module_name), None - ) - if embeddings_module: - lora_embeddings_tensor = embeddings[ - embedding_modules[embeddings_module] - ].to(device=device, dtype=dtype) - if pin_memory: - lora_embeddings_tensor = lora_embeddings_tensor.pin_memory() loras[module_name] = LoRALayerWeights.from_config( - module_name, peft_helper, lora_embeddings_tensor + module_name, peft_helper ) if is_lora_a: @@ -206,15 +188,17 @@ def from_local_checkpoint( lora_tensor_path = os.path.join(lora_dir, "adapter_model.safetensors") lora_bin_file_path = os.path.join(lora_dir, "adapter_model.bin") lora_pt_file_path = os.path.join(lora_dir, "adapter_model.pt") - new_embeddings_tensor_path = os.path.join( - lora_dir, "new_embeddings.safetensors" - ) - new_embeddings_bin_file_path = os.path.join(lora_dir, "new_embeddings.bin") + # new_embeddings_tensor_path = os.path.join( + # lora_dir, "new_embeddings.safetensors" + # ) + # new_embeddings_bin_file_path = os.path.join(lora_dir, "new_embeddings.bin") tensors: dict[str, torch.Tensor] = {} unexpected_modules: list[list[str] | str] = [] def check_unexpected_modules(modules: dict): for lora_module in modules.keys(): # noqa + if is_base_embeddding_weights(lora_module): + continue module_name, _ = parse_fine_tuned_lora_name(lora_module, weights_mapper) # Handle FSDP file format where experts.base_layer is the # gate_up_proj and experts is the down_proj @@ -300,21 +284,12 @@ def check_unexpected_modules(modules: dict): else: raise ValueError(f"{lora_dir} doesn't contain tensors") - embeddings = None - if os.path.isfile(new_embeddings_tensor_path): - embeddings = safetensors.torch.load_file(new_embeddings_tensor_path) - elif os.path.isfile(new_embeddings_bin_file_path): - embeddings = torch.load( - new_embeddings_bin_file_path, map_location=device, weights_only=True - ) - return cls.from_lora_tensors( lora_model_id=get_lora_id() if lora_model_id is None else lora_model_id, tensors=tensors, peft_helper=peft_helper, device=device, dtype=dtype, - embeddings=embeddings, target_embedding_padding=target_embedding_padding, embedding_modules=embedding_modules, embedding_padding_modules=embedding_padding_modules, @@ -474,7 +449,6 @@ def activate_adapter( index, module_lora.lora_a, module_lora.lora_b, - module_lora.embeddings_tensor, ) else: module.reset_lora(index) @@ -505,7 +479,6 @@ def _set_adapter_mapping(self, mapping: LoRAMapping) -> None: self.lora_index_to_id, self.lora_slots + 1, self.vocab_size, - self.lora_config.lora_extra_vocab_size, ) def remove_all_adapters(self): @@ -616,7 +589,6 @@ def create_dummy_lora( if parts[-1] in embedding_modules: input_dim = ( module.base_layer.org_vocab_size - + self.lora_config.lora_extra_vocab_size if hasattr(module.base_layer, "org_vocab_size") else module.base_layer.weight.shape[1] ) @@ -625,11 +597,6 @@ def create_dummy_lora( if hasattr(module.base_layer, "embedding_dim") else module.base_layer.weight.shape[0] ) - embeddings_tensor_dim = ( - module.base_layer.embedding_dim - if 
hasattr(module.base_layer, "embedding_dim") - else module.base_layer.weight.shape[1] - ) lora = LoRALayerWeights.create_dummy_lora_weights( module_name, input_dim, @@ -637,7 +604,6 @@ def create_dummy_lora( rank, module.lora_a_stacked[0].dtype, "cpu", - embeddings_tensor_dim=embeddings_tensor_dim, ) else: lora = LoRALayerWeights.create_dummy_lora_weights( diff --git a/vllm/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py index a6ffbb7b71ce..7c0fc8167711 100644 --- a/vllm/lora/punica_wrapper/punica_base.py +++ b/vllm/lora/punica_wrapper/punica_base.py @@ -31,7 +31,6 @@ def update_metadata( lora_index_to_id: list[int | None], max_loras: int, vocab_size: int, - extra_vocab_size: int, **kwargs, ) -> None: """ @@ -172,8 +171,11 @@ def _update_base_metadata( lora_index_to_id: list[int | None], max_loras: int, vocab_size: int, - extra_vocab_size: int, ): + # NOTE We have removed lora extra vocab support for now. So we set + # extra_vocab_size always to 0, and extra_vocab_size will be removed. + + extra_vocab_size = 0 ( base_indices, sampler_indices, @@ -285,12 +287,9 @@ def update_metadata( lora_index_to_id: list[int | None], max_loras: int, vocab_size: int, - extra_vocab_size: int, **kwargs, ): - self._update_base_metadata( - mapping, lora_index_to_id, max_loras, vocab_size, extra_vocab_size - ) + self._update_base_metadata(mapping, lora_index_to_id, max_loras, vocab_size) if mapping.is_prefill: # Update metadata required for prefill-related operators. diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py index d863a5884d3c..52138ef0cc3b 100644 --- a/vllm/lora/punica_wrapper/punica_gpu.py +++ b/vllm/lora/punica_wrapper/punica_gpu.py @@ -65,13 +65,10 @@ def update_metadata( lora_index_to_id: list[int | None], max_loras: int, vocab_size: int, - extra_vocab_size: int, **kwargs, ): self.is_prefill = mapping.is_prefill - self._update_base_metadata( - mapping, lora_index_to_id, max_loras, vocab_size, extra_vocab_size - ) + self._update_base_metadata(mapping, lora_index_to_id, max_loras, vocab_size) # Prepare cuda kernel metadata tensors self.token_mapping_meta.prepare_tensors(self.token_lora_indices) diff --git a/vllm/lora/punica_wrapper/punica_tpu.py b/vllm/lora/punica_wrapper/punica_tpu.py index 090878dcd254..0888772db54e 100644 --- a/vllm/lora/punica_wrapper/punica_tpu.py +++ b/vllm/lora/punica_wrapper/punica_tpu.py @@ -292,7 +292,6 @@ def _update_base_metadata( lora_index_to_id: list[int | None], max_loras: int, vocab_size: int, - extra_vocab_size: int, ): # Make sure we don't accidentally collect outside operations torch_xla.sync() @@ -313,7 +312,7 @@ def _update_base_metadata( lora_index_to_id, max_loras, vocab_size, - extra_vocab_size, + 0, # extra_vocab_size "cpu", ) self._token_lora_indices = self._pad_to_shape( diff --git a/vllm/lora/punica_wrapper/punica_xpu.py b/vllm/lora/punica_wrapper/punica_xpu.py index b95087d0ff83..00c00782896c 100644 --- a/vllm/lora/punica_wrapper/punica_xpu.py +++ b/vllm/lora/punica_wrapper/punica_xpu.py @@ -43,13 +43,10 @@ def update_metadata( lora_index_to_id: list[int | None], max_loras: int, vocab_size: int, - extra_vocab_size: int, **kwargs, ): self.is_prefill = mapping.is_prefill - self._update_base_metadata( - mapping, lora_index_to_id, max_loras, vocab_size, extra_vocab_size - ) + self._update_base_metadata(mapping, lora_index_to_id, max_loras, vocab_size) def _get_token_lora_indices(self, x: torch.Tensor) -> torch.IntTensor: return torch.narrow(self._token_lora_indices, 0, 0, x.size(0)) diff
--git a/vllm/lora/utils.py b/vllm/lora/utils.py index 0f43ff06d8f2..a49a7d9d1669 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -166,6 +166,16 @@ def parse_fine_tuned_lora_name( raise ValueError(f"{name} is unsupported LoRA weight") +def is_base_embeddding_weights(name: str) -> bool: + # hardcoded suffixes for input & output embedding weights + input_embedding_subfix = ".embed_tokens.base_layer.weight" + output_embedding_subfix = ".lm_head.base_layer.weight" + + return name.endswith(input_embedding_subfix) or name.endswith( + output_embedding_subfix + ) + + def is_regex_target_modules( load_modules: str | list[str], expected_lora_modules: list[str] ) -> bool: diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index b85151f2c759..4cc201a6414f 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -121,8 +121,7 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel: lora_model_id=lora_request.lora_int_id, device="cpu", dtype=self.lora_config.lora_dtype, - target_embedding_padding=self.vocab_size - + self.lora_config.lora_extra_vocab_size, + target_embedding_padding=self.vocab_size, embedding_modules=self.embedding_modules, embedding_padding_modules=self.embedding_padding_modules, tensorizer_config_dict=lora_request.tensorizer_config_dict, @@ -143,12 +142,6 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel: # For BadRequestError raise e - if lora.extra_vocab_size > self.lora_config.lora_extra_vocab_size: - raise ValueError( - f"LoRA added vocab size {lora.extra_vocab_size} " - f"is greater than lora_extra_vocab_size " - f"{self.lora_config.lora_extra_vocab_size}." - ) return lora def add_dummy_lora(self, lora_request: LoRARequest, rank: int) -> bool: diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index 1dc205b47753..cd7ce2fc8f00 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -46,7 +46,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( - DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding, ) @@ -261,29 +260,16 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config self.config = config self.quant_config = quant_config - lora_vocab = ( - (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) - if lora_config - else 0 - ) - self.vocab_size = config.vocab_size + lora_vocab - self.org_vocab_size = config.vocab_size + if get_pp_group().is_first_rank or ( config.tie_word_embeddings and get_pp_group().is_last_rank ): self.embed_tokens = VocabParallelEmbedding( - self.vocab_size, + config.vocab_size, config.hidden_size, - org_num_embeddings=config.vocab_size, - padding_size=DEFAULT_VOCAB_PADDING_SIZE - # We need bigger padding if using lora for kernel - # compatibility - if not lora_config - else lora_config.lora_vocab_padding_size, quant_config=quant_config, ) else: @@ -420,28 +406,18 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config self.config = config - self.lora_config = lora_config + self.quant_config =
quant_config self.model = GraniteModel( vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") ) if get_pp_group().is_last_rank: - self.unpadded_vocab_size = config.vocab_size - if lora_config: - self.unpadded_vocab_size += lora_config.lora_extra_vocab_size self.lm_head = ParallelLMHead( - self.unpadded_vocab_size, + config.vocab_size, config.hidden_size, - org_num_embeddings=config.vocab_size, - padding_size=DEFAULT_VOCAB_PADDING_SIZE - # We need bigger padding if using lora for kernel - # compatibility - if not lora_config - else lora_config.lora_vocab_padding_size, quant_config=quant_config, prefix=maybe_prefix(prefix, "lm_head"), ) @@ -453,7 +429,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): logit_scale /= config.logits_scaling self.logits_processor = LogitsProcessor( - self.unpadded_vocab_size, config.vocab_size, scale=logit_scale + config.vocab_size, scale=logit_scale ) else: self.lm_head = PPMissingLayer() diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index d5b49d2fb4c2..ebf8addda4a5 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -47,7 +47,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( - DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding, ) @@ -368,24 +367,18 @@ def __init__( config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config self.config = config self.quant_config = quant_config - lora_vocab = ( - (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) - if lora_config - else 0 - ) - self.vocab_size = config.vocab_size + lora_vocab - self.org_vocab_size = config.vocab_size + + self.vocab_size = config.vocab_size + if get_pp_group().is_first_rank or ( config.tie_word_embeddings and get_pp_group().is_last_rank ): self.embed_tokens = VocabParallelEmbedding( self.vocab_size, config.hidden_size, - org_num_embeddings=config.vocab_size, quant_config=quant_config, ) else: @@ -562,9 +555,7 @@ def __init__( super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config self.config = config - self.lora_config = lora_config self.model = self._init_model( vllm_config=vllm_config, @@ -573,20 +564,9 @@ def __init__( ) if get_pp_group().is_last_rank: - self.unpadded_vocab_size = config.vocab_size - if lora_config: - self.unpadded_vocab_size += lora_config.lora_extra_vocab_size self.lm_head = ParallelLMHead( - self.unpadded_vocab_size, + config.vocab_size, config.hidden_size, - org_num_embeddings=config.vocab_size, - padding_size=( - DEFAULT_VOCAB_PADDING_SIZE - # We need bigger padding if using lora for kernel - # compatibility - if not lora_config - else lora_config.lora_vocab_padding_size - ), quant_config=quant_config, prefix=maybe_prefix(prefix, "lm_head"), ) @@ -595,7 +575,7 @@ def __init__( logit_scale = getattr(config, "logit_scale", 1.0) self.logits_processor = LogitsProcessor( - self.unpadded_vocab_size, config.vocab_size, logit_scale + config.vocab_size, scale=logit_scale ) else: self.lm_head = PPMissingLayer() diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 54ab8dd493e7..0a9c3f136964 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -51,7 
+51,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( - DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding, ) @@ -301,23 +300,18 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config + parallel_config = vllm_config.parallel_config self.config = config self.quant_config = quant_config - lora_vocab = ( - (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) - if lora_config - else 0 - ) - self.vocab_size = config.vocab_size + lora_vocab + + self.vocab_size = config.vocab_size self.org_vocab_size = config.vocab_size self.embed_tokens = VocabParallelEmbedding( self.vocab_size, config.hidden_size, - org_num_embeddings=config.vocab_size, ) self.enable_eplb = parallel_config.enable_eplb @@ -508,34 +502,24 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config + self.config = config - self.lora_config = lora_config + self.quant_config = quant_config self.model = MixtralModel( vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") ) - self.unpadded_vocab_size = config.vocab_size - if lora_config: - self.unpadded_vocab_size += lora_config.lora_extra_vocab_size + self.lm_head = ParallelLMHead( - self.unpadded_vocab_size, + config.vocab_size, config.hidden_size, - org_num_embeddings=config.vocab_size, - padding_size=DEFAULT_VOCAB_PADDING_SIZE - # We need bigger padding if using lora for kernel - # compatibility - if not lora_config - else lora_config.lora_vocab_padding_size, quant_config=quant_config, prefix=maybe_prefix(prefix, "lm_head"), ) if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight - self.logits_processor = LogitsProcessor( - self.unpadded_vocab_size, config.vocab_size - ) + self.logits_processor = LogitsProcessor(config.vocab_size) self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors ) diff --git a/vllm/model_executor/models/teleflm.py b/vllm/model_executor/models/teleflm.py index 8a0bec9dff84..bebd7bcaa924 100644 --- a/vllm/model_executor/models/teleflm.py +++ b/vllm/model_executor/models/teleflm.py @@ -74,5 +74,5 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.output_mult = self.config.output_mult / self.mup_scale_factor logit_scale = self.output_mult self.logits_processor = LogitsProcessor( - self.unpadded_vocab_size, self.config.vocab_size, logit_scale + self.config.vocab_size, scale=logit_scale ) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index e9eb7cad38f8..923c31c187f3 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -219,9 +219,6 @@ def __init__( self.hidden_size = model_config.get_hidden_size() self.vocab_size = model_config.get_vocab_size() - if self.lora_config is not None: - self.vocab_size += self.lora_config.lora_extra_vocab_size - # Multi-modal data support self.mm_registry = MULTIMODAL_REGISTRY self.uses_mrope = model_config.uses_mrope From ed6ae1e36a03bed4a29287163e051a7772b1d8b1 Mon Sep 17 00:00:00 2001 From: Xiao Li Date: Thu, 20 Nov 2025 17:54:35 -0800 Subject: [PATCH 
262/578] [AITER] [ROCm] Fix crash when loading llama4 model with old aiter version installed, fallback to forward_native implementation (#29124) Signed-off-by: Xiao Li --- vllm/v1/sample/ops/topk_topp_sampler.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py index c6c7e924175f..5b2d130b0ea4 100644 --- a/vllm/v1/sample/ops/topk_topp_sampler.py +++ b/vllm/v1/sample/ops/topk_topp_sampler.py @@ -60,13 +60,20 @@ def __init__(self, logprobs_mode: LogprobsMode = "raw_logprobs") -> None: logprobs_mode not in ("processed_logits", "processed_logprobs") and rocm_aiter_ops.is_enabled() ): - import aiter.ops.sampling # noqa: F401 + try: + import aiter.ops.sampling # noqa: F401 - self.aiter_ops = torch.ops.aiter - logger.info_once( - "Using aiter sampler on ROCm (lazy import, sampling-only)." - ) - self.forward = self.forward_hip + self.aiter_ops = torch.ops.aiter + logger.info_once( + "Using aiter sampler on ROCm (lazy import, sampling-only)." + ) + self.forward = self.forward_hip + except ImportError: + logger.warning_once( + "aiter.ops.sampling is not available on ROCm. " + "Falling back to forward_native implementation." + ) + self.forward = self.forward_native else: self.forward = self.forward_native From e1eefa4c40fc5b28bd7e83b6596bb5d2f420fd92 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 20 Nov 2025 20:54:59 -0500 Subject: [PATCH 263/578] [Bug] Fix torch warning of tf32 usage (#29112) Signed-off-by: yewentao256 --- vllm/model_executor/layers/batch_invariant.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py index 69fa6bdffd43..bec7af028634 100644 --- a/vllm/model_executor/layers/batch_invariant.py +++ b/vllm/model_executor/layers/batch_invariant.py @@ -852,5 +852,6 @@ def init_batch_invariance(): enable_batch_invariant_mode() # Disable TF32 for batch invariance - it causes non-deterministic rounding - torch.backends.cuda.matmul.allow_tf32 = False - torch.backends.cudnn.allow_tf32 = False + torch.backends.cuda.matmul.fp32_precision = "ieee" + torch.backends.cudnn.conv.fp32_precision = "ieee" + torch.backends.cudnn.rnn.fp32_precision = "ieee" From 3f5f36da3fefbae96960f60d41ccf8ac1155515e Mon Sep 17 00:00:00 2001 From: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Date: Thu, 20 Nov 2025 22:30:07 -0500 Subject: [PATCH 264/578] [ROCm] Fix for import when building with upstream triton for gfx1100 for gpt-oss serving (#29127) Signed-off-by: Hongxia Yang --- .../layers/quantization/utils/mxfp4_utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py index cbc46810a26a..d0c8b3d1a309 100644 --- a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +++ b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py @@ -39,15 +39,15 @@ def _swizzle_mxfp4(quant_tensor, scale, num_warps): value_layout = StridedLayout scale_layout = StridedLayout elif current_platform.is_rocm(): - from triton_kernels.tensor_details.layout import ( - GFX950MXScaleLayout, - StridedLayout, - ) - from vllm.platforms.rocm import on_gfx950 value_layout = StridedLayout - scale_layout = GFX950MXScaleLayout if on_gfx950() else StridedLayout + if on_gfx950(): + from 
triton_kernels.tensor_details.layout import GFX950MXScaleLayout + + scale_layout = GFX950MXScaleLayout + else: + scale_layout = StridedLayout else: value_layout, value_layout_opts = layout.make_default_matmul_mxfp4_w_layout( mx_axis=1 From 56669c1f293d5c53b6a19ddf2f78802fa9fff2c2 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 20 Nov 2025 22:36:07 -0500 Subject: [PATCH 265/578] [CI] Fix mypy for `vllm/v1/worker` (#29037) Signed-off-by: yewentao256 --- tools/pre_commit/mypy.py | 2 +- vllm/model_executor/utils.py | 2 +- vllm/multimodal/utils.py | 4 +- vllm/v1/worker/cpu_worker.py | 12 +- vllm/v1/worker/gpu_model_runner.py | 128 +++++++++++------- vllm/v1/worker/gpu_ubatch_wrapper.py | 20 ++- vllm/v1/worker/gpu_worker.py | 62 +++++---- .../worker/kv_connector_model_runner_mixin.py | 2 +- vllm/v1/worker/tpu_model_runner.py | 28 +++- vllm/v1/worker/tpu_worker.py | 5 +- vllm/v1/worker/utils.py | 8 +- vllm/v1/worker/worker_base.py | 2 + vllm/v1/worker/xpu_worker.py | 9 +- 13 files changed, 180 insertions(+), 104 deletions(-) diff --git a/tools/pre_commit/mypy.py b/tools/pre_commit/mypy.py index 8d04848f8f78..34f6e8c928ff 100755 --- a/tools/pre_commit/mypy.py +++ b/tools/pre_commit/mypy.py @@ -38,6 +38,7 @@ "vllm/usage", "vllm/v1/core", "vllm/v1/engine", + "vllm/v1/worker", ] # After fixing errors resulting from changing follow_imports @@ -62,7 +63,6 @@ "vllm/v1/sample", "vllm/v1/spec_decode", "vllm/v1/structured_output", - "vllm/v1/worker", ] # TODO(woosuk): Include the code from Megatron and HuggingFace. diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py index 759b809433b1..8aad59e84ff2 100644 --- a/vllm/model_executor/utils.py +++ b/vllm/model_executor/utils.py @@ -10,7 +10,7 @@ from vllm.utils.torch_utils import is_torch_equal_or_newer -def set_random_seed(seed: int) -> None: +def set_random_seed(seed: int | None) -> None: from vllm.platforms import current_platform current_platform.seed_everything(seed) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 3f55c46ca334..ac89bdacc01d 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -3,7 +3,7 @@ import asyncio import atexit -from collections.abc import Iterable, Set +from collections.abc import Generator, Set from concurrent.futures import ThreadPoolExecutor from itertools import groupby from pathlib import Path @@ -403,7 +403,7 @@ def group_mm_kwargs_by_modality( pin_memory: bool = False, merge_by_field_config: bool | None = None, multimodal_cpu_fields: Set[str] = frozenset(), -) -> Iterable[tuple[str, int, BatchedTensorInputs]]: +) -> Generator[tuple[str, int, BatchedTensorInputs], None, None]: """Group consecutive `MultiModalKwargsItem`s from `mm_kwargs` with the same modality together into the same `MultiModalKwargs` instance. diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py index 4420a057d1e5..b080fea1d2dd 100644 --- a/vllm/v1/worker/cpu_worker.py +++ b/vllm/v1/worker/cpu_worker.py @@ -3,6 +3,7 @@ import os import platform from collections.abc import Callable +from typing import Any import torch @@ -37,6 +38,9 @@ def __init__( self.parallel_config.disable_custom_all_reduce = True + # Torch profiler. 
Enabled and configured through env vars: + # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace + self.profiler: Any | None = None if envs.VLLM_TORCH_PROFILER_DIR: torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR worker_name = f"{vllm_config.instance_id}-rank-{self.rank}" @@ -80,13 +84,13 @@ def init_device(self): self.local_omp_cpuid = "nobind" else: local_dp_rank = self.parallel_config.data_parallel_rank_local - omp_cpuids = omp_cpuids.split("|") + omp_cpuids_list = omp_cpuids.split("|") if local_dp_rank is not None: world_size = self.parallel_config.world_size - omp_cpuids = omp_cpuids[ + omp_cpuids_list = omp_cpuids_list[ local_dp_rank * world_size : (local_dp_rank + 1) * world_size ] - self.local_omp_cpuid = omp_cpuids[self.rank] + self.local_omp_cpuid = omp_cpuids_list[self.rank] if self.local_omp_cpuid != "nobind": ret = torch.ops._C_utils.init_cpu_threads_env(self.local_omp_cpuid) @@ -120,7 +124,7 @@ def wake_up(self, tags: list[str] | None = None) -> None: pass def determine_available_memory(self) -> int: - return self.cache_config.cpu_kvcache_space_bytes # type: ignore + return self.cache_config.cpu_kvcache_space_bytes or 0 def compile_or_warm_up_model(self) -> None: # Reset the seed to ensure that the random state is not affected by diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4b0a08ab57e1..a7fa68b20ac5 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -5,7 +5,7 @@ import itertools import time from collections import defaultdict -from collections.abc import Iterator +from collections.abc import Iterator, Sequence from contextlib import contextmanager from copy import copy, deepcopy from functools import reduce @@ -53,6 +53,7 @@ from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader from vllm.model_executor.models.interfaces import ( + SupportsMRoPE, SupportsMultiModal, is_mixture_of_experts, supports_eagle3, @@ -126,6 +127,7 @@ ) from vllm.v1.pool.metadata import PoolingMetadata from vllm.v1.sample.logits_processor import LogitsProcessors, build_logitsprocs +from vllm.v1.sample.logits_processor.interface import LogitsProcessor from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.rejection_sampler import RejectionSampler from vllm.v1.sample.sampler import Sampler @@ -404,7 +406,10 @@ def __init__( # solution, we initialize the input batch here, and re-initialize it # in `initialize_kv_cache` if the block_sizes here is different from # the block_sizes in the kv cache config. - custom_logitsprocs = model_config.logits_processors + logits_processors = model_config.logits_processors + custom_logitsprocs: Sequence[str | type[LogitsProcessor]] = ( + tuple(logits_processors) if logits_processors is not None else () + ) self.input_batch = InputBatch( max_num_reqs=self.max_num_reqs, # We need to use the encoder length for encoder-decoer @@ -959,9 +964,13 @@ def _update_states_after_model_execute( def _init_mrope_positions(self, req_state: CachedRequestState): model = self.get_model() assert supports_mrope(model), "M-RoPE support is not implemented." + assert req_state.prompt_token_ids is not None, ( + "M-RoPE requires prompt_token_ids to be available." 
+ ) + mrope_model = cast(SupportsMRoPE, model) req_state.mrope_positions, req_state.mrope_position_delta = ( - model.get_mrope_input_positions( + mrope_model.get_mrope_input_positions( req_state.prompt_token_ids, req_state.mm_features, ) @@ -1762,6 +1771,7 @@ def _calc_mrope_positions(self, scheduler_output: "SchedulerOutput"): dst_start = mrope_pos_ptr dst_end = mrope_pos_ptr + completion_part_len + assert req.mrope_position_delta is not None MRotaryEmbedding.get_next_input_positions_tensor( out=self.mrope_positions.np, out_offset=dst_start, @@ -1907,6 +1917,8 @@ def _batch_mm_kwargs_from_scheduler( for mm_input_id in encoder_input_ids: mm_feature = req_state.mm_features[mm_input_id] + if mm_feature.data is None: + continue mm_hash = mm_feature.identifier mm_kwargs.append(mm_feature.data) mm_hashes_pos.append((mm_hash, mm_feature.mm_position)) @@ -1930,7 +1942,7 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"): # multimodal inputs. The proper solution should be reordering the # encoder outputs. model = cast(SupportsMultiModal, self.model) - encoder_outputs = [] + encoder_outputs: list[torch.Tensor] = [] for modality, num_items, mm_kwargs_group in group_mm_kwargs_by_modality( mm_kwargs, device=self.device, @@ -1938,7 +1950,7 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"): merge_by_field_config=model.merge_by_field_config, multimodal_cpu_fields=model.multimodal_cpu_fields, ): - curr_group_outputs = [] + curr_group_outputs: list[torch.Tensor] = [] # EVS-related change. # (ekhvedchenia): Temporary hack to limit peak memory usage when @@ -1980,7 +1992,7 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"): # 2. A list or tuple (length: num_items) of tensors, # each of shape (feature_size, hidden_size) in case the feature # size is dynamic depending on the input multimodal items. 
- curr_group_outputs = model.embed_multimodal(**mm_kwargs_group) + curr_group_outputs = model.embed_multimodal(**mm_kwargs_group) # type: ignore[assignment] sanity_check_mm_encoder_outputs( curr_group_outputs, @@ -2180,7 +2192,7 @@ def get_supported_tasks(self) -> tuple[SupportedTask, ...]: def sync_and_slice_intermediate_tensors( self, num_tokens: int, - intermediate_tensors: IntermediateTensors, + intermediate_tensors: IntermediateTensors | None, sync_self: bool, ) -> IntermediateTensors: assert self.intermediate_tensors is not None @@ -2397,6 +2409,7 @@ def _preprocess( if is_first_rank: intermediate_tensors = None else: + assert intermediate_tensors is not None intermediate_tensors = self.sync_and_slice_intermediate_tensors( num_input_tokens, intermediate_tensors, True ) @@ -2765,14 +2778,14 @@ def execute_model( uniform_decode = ( max_num_scheduled_tokens == self.uniform_decode_query_len ) and (num_scheduled_tokens == num_reqs * max_num_scheduled_tokens) - batch_descriptor = BatchDescriptor( + batch_desc = BatchDescriptor( num_tokens=num_input_tokens, uniform_decode=uniform_decode, has_lora=len(self.input_batch.lora_id_to_lora_request) > 0, ) cudagraph_runtime_mode, batch_descriptor = ( self.cudagraph_dispatcher.dispatch( - batch_descriptor, + batch_desc, use_cascade_attn=cascade_attn_prefix_lens is not None, ) ) @@ -2856,15 +2869,15 @@ def execute_model( else: logits = self.model.compute_logits(sample_hidden_states) - model_output_broadcast_data = {} + model_output_broadcast_data: dict[str, Any] = {} if logits is not None: model_output_broadcast_data["logits"] = logits.contiguous() - model_output_broadcast_data = get_pp_group().broadcast_tensor_dict( + broadcasted = get_pp_group().broadcast_tensor_dict( model_output_broadcast_data, src=len(get_pp_group().ranks) - 1 ) - assert model_output_broadcast_data is not None - logits = model_output_broadcast_data["logits"] + assert broadcasted is not None + logits = broadcasted["logits"] self.execute_model_state = ExecuteModelState( scheduler_output, @@ -2889,7 +2902,7 @@ def sample_tokens( if self.execute_model_state is None: # Nothing to do (PP non-final rank case), output isn't used. 
if not kv_connector_output: - return None # noqa + return None # type: ignore[return-value] # In case of PP with kv transfer, we need to pass through the # kv_connector_output @@ -2941,33 +2954,37 @@ def propose_draft_token_ids( spec_decode_common_attn_metadata, ) + spec_config = self.speculative_config use_padded_batch_for_eagle = ( - self.speculative_config - and self.speculative_config.use_eagle() - and not self.speculative_config.disable_padded_drafter_batch + spec_config is not None + and spec_config.use_eagle() + and not spec_config.disable_padded_drafter_batch ) effective_drafter_max_model_len = self.max_model_len if effective_drafter_max_model_len is None: effective_drafter_max_model_len = self.model_config.max_model_len if ( - self.speculative_config - and self.speculative_config.draft_model_config is not None - and self.speculative_config.draft_model_config.max_model_len is not None + spec_config is not None + and spec_config.draft_model_config is not None + and spec_config.draft_model_config.max_model_len is not None ): effective_drafter_max_model_len = ( - self.speculative_config.draft_model_config.max_model_len + spec_config.draft_model_config.max_model_len ) input_fits_in_drafter = spec_decode_common_attn_metadata and ( spec_decode_common_attn_metadata.max_seq_len + self.num_spec_tokens <= effective_drafter_max_model_len ) if use_padded_batch_for_eagle: + assert self.speculative_config is not None + assert isinstance(self.drafter, EagleProposer) sampled_token_ids = sampler_output.sampled_token_ids if input_fits_in_drafter: # EAGLE speculative decoding can use the GPU sampled tokens # as inputs, and does not need to wait for bookkeeping to finish. propose_draft_token_ids(sampled_token_ids) elif self.valid_sampled_token_count_event is not None: + assert spec_decode_common_attn_metadata is not None next_token_ids, valid_sampled_tokens_count = ( self.drafter.prepare_next_token_ids_padded( spec_decode_common_attn_metadata, @@ -3105,7 +3122,9 @@ def propose_draft_token_ids( common_attn_metadata: CommonAttentionMetadata, ) -> torch.Tensor | list[list[int]]: num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens - if self.speculative_config.method == "ngram": + spec_config = self.speculative_config + assert spec_config is not None + if spec_config.method == "ngram": assert isinstance(sampled_token_ids, list) assert isinstance(self.drafter, NgramProposer) draft_token_ids = self.drafter.propose( @@ -3115,11 +3134,11 @@ def propose_draft_token_ids( self.input_batch.token_ids_cpu, self.input_batch.spec_decode_unsupported_reqs, ) - elif self.speculative_config.method == "suffix": + elif spec_config.method == "suffix": assert isinstance(sampled_token_ids, list) assert isinstance(self.drafter, SuffixDecodingProposer) draft_token_ids = self.drafter.propose(self.input_batch, sampled_token_ids) - elif self.speculative_config.method == "medusa": + elif spec_config.method == "medusa": assert isinstance(sampled_token_ids, list) assert isinstance(self.drafter, MedusaProposer) @@ -3144,10 +3163,10 @@ def propose_draft_token_ids( target_hidden_states=hidden_states, sampling_metadata=sampling_metadata, ) - elif self.speculative_config.use_eagle(): + elif spec_config.use_eagle(): assert isinstance(self.drafter, EagleProposer) - if self.speculative_config.disable_padded_drafter_batch: + if spec_config.disable_padded_drafter_batch: # When padded-batch is disabled, the sampled_token_ids should be # the cpu-side list[list[int]] of valid sampled tokens for each # request, with invalid requests 
having empty lists. @@ -3197,7 +3216,7 @@ def propose_draft_token_ids( else: target_hidden_states = hidden_states[:num_scheduled_tokens] else: - if self.speculative_config.disable_padded_drafter_batch: + if spec_config.disable_padded_drafter_batch: token_indices_to_sample = None common_attn_metadata, token_indices = self.drafter.prepare_inputs( common_attn_metadata, @@ -3292,9 +3311,12 @@ def load_model(self, eep_scale_up: bool = False) -> None: and is_mixture_of_experts(self.drafter.model) and self.parallel_config.enable_eplb ): + spec_config = self.vllm_config.speculative_config + assert spec_config is not None + assert spec_config.draft_model_config is not None logger.info_once( "EPLB is enabled for drafter model %s.", - self.vllm_config.speculative_config.draft_model_config.model, + spec_config.draft_model_config.model, ) global_expert_load = ( @@ -3311,7 +3333,7 @@ def load_model(self, eep_scale_up: bool = False) -> None: self.eplb_state = EplbState(self.parallel_config, self.device) self.eplb_state.add_model( self.drafter.model, - self.vllm_config.speculative_config.draft_model_config, + spec_config.draft_model_config, global_expert_load, old_global_expert_indices, rank_mapping, @@ -3346,9 +3368,11 @@ def load_model(self, eep_scale_up: bool = False) -> None: scope="local", ) prepare_communication_buffer_for_model(self.model) + mm_config = self.model_config.multimodal_config self.is_multimodal_pruning_enabled = ( supports_multimodal_pruning(self.get_model()) - and self.model_config.multimodal_config.is_multimodal_pruning_enabled() + and mm_config is not None + and mm_config.is_multimodal_pruning_enabled() ) if is_mixture_of_experts(self.model) and self.parallel_config.enable_eplb: @@ -3383,15 +3407,14 @@ def load_model(self, eep_scale_up: bool = False) -> None: # CudagraphWraper and CudagraphDispatcher of vllm. # wrap the model with full cudagraph wrapper if needed. - if ( - self.compilation_config.cudagraph_mode.has_full_cudagraphs() - and not self.parallel_config.enable_dbo - ): + cudagraph_mode = self.compilation_config.cudagraph_mode + assert cudagraph_mode is not None + if cudagraph_mode.has_full_cudagraphs() and not self.parallel_config.enable_dbo: self.model = CUDAGraphWrapper( self.model, self.vllm_config, runtime_mode=CUDAGraphMode.FULL ) elif self.parallel_config.enable_dbo: - if self.compilation_config.cudagraph_mode.has_full_cudagraphs(): + if cudagraph_mode.has_full_cudagraphs(): self.model = UBatchWrapper( self.model, self.vllm_config, CUDAGraphMode.FULL, self.device ) @@ -4071,7 +4094,8 @@ def _dummy_pooler_run( def profile_run(self) -> None: # Profile with multimodal encoder & encoder cache. if self.supports_mm_inputs: - if self.model_config.multimodal_config.skip_mm_profiling: + mm_config = self.model_config.multimodal_config + if mm_config is not None and mm_config.skip_mm_profiling: logger.info( "Skipping memory profiling for multimodal encoder and " "encoder cache." 
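The recurring idiom in this typing cleanup is to bind an optional attribute to a local name, rule out None explicitly (an assert where None would be a programming error, an if-guard where None is a legitimate state), and only then dereference it, so mypy can narrow the type. Below is a minimal, self-contained sketch of that idiom; the MultimodalConfig, ModelConfig, and Runner classes are hypothetical stand-ins, not vLLM's real types.

from dataclasses import dataclass


@dataclass
class MultimodalConfig:
    skip_mm_profiling: bool = False


@dataclass
class ModelConfig:
    # None for text-only models, mirroring an optional sub-config.
    multimodal_config: MultimodalConfig | None = None


class Runner:
    def __init__(self, model_config: ModelConfig) -> None:
        self.model_config = model_config

    def profile_run(self) -> None:
        # Bind the optional once; mypy tracks the narrowed local,
        # which it cannot do for a repeated attribute access.
        mm_config = self.model_config.multimodal_config
        if mm_config is not None and mm_config.skip_mm_profiling:
            print("skipping multimodal profiling")
        else:
            print("profiling multimodal encoder")


Runner(ModelConfig(MultimodalConfig(skip_mm_profiling=True))).profile_run()

The asserts and cast() calls elsewhere in this patch achieve the same narrowing in the cases where a None or abstract type cannot actually occur at runtime.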
@@ -4333,8 +4357,9 @@ class AttentionGroupKey(NamedTuple): def get_attn_backends_for_group( kv_cache_group_spec: KVCacheGroupSpec, ) -> tuple[dict[AttentionGroupKey, list[str]], set[type[AttentionBackend]]]: + layer_type = cast(type[Any], AttentionLayerBase) layers = get_layers_from_vllm_config( - self.vllm_config, AttentionLayerBase, kv_cache_group_spec.layer_names + self.vllm_config, layer_type, kv_cache_group_spec.layer_names ) attn_backends = {} attn_backend_layers = defaultdict(list) @@ -4349,7 +4374,7 @@ def get_attn_backends_for_group( if layer_name in self.kv_sharing_fast_prefill_eligible_layers: attn_backend = create_fast_prefill_custom_backend( "FastPrefill", - attn_backend, + attn_backend, # type: ignore[arg-type] ) full_cls_name = attn_backend.full_cls_name() @@ -4448,6 +4473,7 @@ def _check_and_update_cudagraph_mode( min_cg_backend_name = attn_backend.__name__ # Flexible resolve the cudagraph mode cudagraph_mode = self.compilation_config.cudagraph_mode + assert cudagraph_mode is not None # check cudagraph for mixed batch is supported if ( cudagraph_mode.mixed_mode() == CUDAGraphMode.FULL @@ -4562,12 +4588,17 @@ def _check_and_update_cudagraph_mode( self.compilation_config.adjust_cudagraph_sizes_for_spec_decode( self.uniform_decode_query_len, self.parallel_config.tensor_parallel_size ) - self.cudagraph_batch_sizes = self.compilation_config.cudagraph_capture_sizes + capture_sizes = self.compilation_config.cudagraph_capture_sizes + self.cudagraph_batch_sizes = ( + capture_sizes if capture_sizes is not None else [] + ) # Trigger cudagraph dispatching keys initialization after # resolved cudagraph mode. + cudagraph_mode = self.compilation_config.cudagraph_mode + assert cudagraph_mode is not None self.cudagraph_dispatcher.initialize_cudagraph_keys( - self.compilation_config.cudagraph_mode, self.uniform_decode_query_len + cudagraph_mode, self.uniform_decode_query_len ) def calculate_reorder_batch_threshold(self) -> None: @@ -4579,7 +4610,7 @@ def calculate_reorder_batch_threshold(self) -> None: """ min_none_high = lambda a, b: a if b is None else b if a is None else min(a, b) - reorder_batch_thresholds = [ + reorder_batch_thresholds: list[int | None] = [ group.get_metadata_builder().reorder_batch_threshold for group in self._attn_group_iterator() ] @@ -4588,7 +4619,7 @@ def calculate_reorder_batch_threshold(self) -> None: if len(reorder_batch_thresholds) == 0: self.reorder_batch_threshold = None return - self.reorder_batch_threshold = reduce(min_none_high, reorder_batch_thresholds) + self.reorder_batch_threshold = reduce(min_none_high, reorder_batch_thresholds) # type: ignore[assignment] @staticmethod def select_common_block_size( @@ -5048,12 +5079,16 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: kv_transfer_group.set_host_xfer_buffer_ops(copy_kv_blocks) if self.dcp_world_size > 1: - layers = get_layers_from_vllm_config(self.vllm_config, AttentionLayerBase) + layer_type = cast(type[Any], AttentionLayerBase) + layers = get_layers_from_vllm_config(self.vllm_config, layer_type) for layer in layers.values(): - assert layer.impl.need_to_return_lse_for_decode, ( + layer_impl = getattr(layer, "impl", None) + if layer_impl is None: + continue + assert layer_impl.need_to_return_lse_for_decode, ( "DCP requires attention impls to return" " the softmax lse for decode, but the impl " - f"{layer.impl.__class__.__name__} " + f"{layer_impl.__class__.__name__} " "does not return the softmax lse for decode." 
) @@ -5094,7 +5129,8 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: if has_ec_transfer() and get_ec_transfer().is_producer: return {} kv_cache_spec: dict[str, KVCacheSpec] = {} - attn_layers = get_layers_from_vllm_config(self.vllm_config, AttentionLayerBase) + layer_type = cast(type[Any], AttentionLayerBase) + attn_layers = get_layers_from_vllm_config(self.vllm_config, layer_type) for layer_name, attn_module in attn_layers.items(): if isinstance(attn_module, Attention) and ( kv_tgt_layer := attn_module.kv_sharing_target_layer_name diff --git a/vllm/v1/worker/gpu_ubatch_wrapper.py b/vllm/v1/worker/gpu_ubatch_wrapper.py index 9de123263755..2ce2b6451256 100644 --- a/vllm/v1/worker/gpu_ubatch_wrapper.py +++ b/vllm/v1/worker/gpu_ubatch_wrapper.py @@ -121,18 +121,24 @@ def __init__( @staticmethod def _create_sm_control_context(vllm_config: VllmConfig): - comm_sms = envs.VLLM_DBO_COMM_SMS + comm_sms: int = envs.VLLM_DBO_COMM_SMS set_comm_sms = lambda sms: None if vllm_config.parallel_config.enable_expert_parallel: # Currently only DeepEP highthroughput supports SM control so this # only affects that case. - all2all_manager = get_ep_group().device_communicator.all2all_manager - - if all2all_manager.max_sms_used() is not None: - comm_sms = min(comm_sms, all2all_manager.max_sms_used()) - - if comm_sms > 0: + ep_group = get_ep_group() + device_communicator = ep_group.device_communicator + all2all_manager = None + if device_communicator is not None: + all2all_manager = device_communicator.all2all_manager + + if all2all_manager is not None: + max_sms_used = all2all_manager.max_sms_used() + if max_sms_used is not None: + comm_sms = min(comm_sms, max_sms_used) + + if comm_sms > 0 and all2all_manager is not None: set_comm_sms = lambda sms: all2all_manager.set_num_sms(sms) # TODO(lucas): support other kernels besides DeepGEMM diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 18cbc3826279..f1fd5be966c3 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -6,7 +6,7 @@ import os from contextlib import AbstractContextManager, nullcontext from types import NoneType -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, cast import torch import torch.distributed @@ -87,8 +87,10 @@ def __init__( # Buffers saved before sleep self._sleep_saved_buffers: dict[str, torch.Tensor] = {} - # Torch profiler. Enabled and configured through env vars: + # Torch/CUDA profiler. Enabled and configured through env vars: # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace + # VLLM_TORCH_CUDA_PROFILE=1 + self.profiler: Any | None = None if envs.VLLM_TORCH_PROFILER_DIR: worker_name = f"{vllm_config.instance_id}-rank-{self.rank}" self.profiler = TorchProfilerWrapper( @@ -146,17 +148,17 @@ def _maybe_get_memory_pool_context(self, tag: str) -> AbstractContextManager: assert allocator.get_current_usage() == 0, ( "Sleep mode can only be used for one instance per process." ) - context = allocator.use_memory_pool(tag=tag) + return allocator.use_memory_pool(tag=tag) else: - context = nullcontext() - return context + return nullcontext() def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks def init_device(self): - if self.device_config.device.type == "cuda": + device = self.device_config.device + if isinstance(device, torch.device) and device.type == "cuda": # This env var set by Ray causes exceptions with graph building. 
os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None) if ( @@ -375,23 +377,21 @@ def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None: from vllm.device_allocator.cumem import CuMemAllocator allocator = CuMemAllocator.get_instance() - context = allocator.use_memory_pool(tag="kv_cache") + with allocator.use_memory_pool(tag="kv_cache"): + self.model_runner.initialize_kv_cache(kv_cache_config) else: - context = nullcontext() - with context: self.model_runner.initialize_kv_cache(kv_cache_config) def compile_or_warm_up_model(self) -> None: # warm up sizes that are not in cudagraph capture sizes, # but users still want to compile for better performance, # e.g. for the max-num-batched token size in chunked prefill. - warmup_sizes = self.vllm_config.compilation_config.compile_sizes.copy() + compile_sizes = self.vllm_config.compilation_config.compile_sizes + warmup_sizes = compile_sizes.copy() if compile_sizes is not None else [] if not self.model_config.enforce_eager: - warmup_sizes = [ - x - for x in warmup_sizes - if x not in self.vllm_config.compilation_config.cudagraph_capture_sizes - ] + capture_sizes = self.vllm_config.compilation_config.cudagraph_capture_sizes + if capture_sizes is not None: + warmup_sizes = [x for x in warmup_sizes if x not in capture_sizes] # We skip EPLB here since we don't want to record dummy metrics for size in sorted(warmup_sizes, reverse=True): logger.info("Compile and warming up model for size %d", size) @@ -532,12 +532,12 @@ def execute_model( ) } if forward_pass and not get_pp_group().is_first_rank: - intermediate_tensors = IntermediateTensors( - get_pp_group().recv_tensor_dict( - all_gather_group=get_tp_group(), - all_gather_tensors=all_gather_tensors, - ) + tensor_dict = get_pp_group().recv_tensor_dict( + all_gather_group=get_tp_group(), + all_gather_tensors=all_gather_tensors, ) + assert tensor_dict is not None + intermediate_tensors = IntermediateTensors(tensor_dict) with self.annotate_profile(scheduler_output): output = self.model_runner.execute_model( @@ -605,7 +605,7 @@ def _eplb_before_scale_down(self, old_ep_size: int, new_ep_size: int) -> None: assert self.model_runner.eplb_state is not None self.model_runner.eplb_state.rearrange( execute_shuffle=True, - global_expert_load=None, + global_expert_loads=None, rank_mapping=rank_mapping, ) torch.cuda.synchronize() @@ -661,7 +661,7 @@ def _reconfigure_parallel_config( def _reconfigure_moe( self, old_ep_size: int, new_ep_size: int - ) -> torch.Tensor | None: + ) -> list[torch.Tensor] | None: """ Reconfigure MoE modules with provided reconfig_request @@ -728,26 +728,29 @@ def update_moe_modules(moe_modules: list[FusedMoE], num_local_experts: int): num_local_physical_experts = num_local_experts assert self.model_runner.eplb_state is not None new_physical_experts = ( - self.model_runner.eplb_state.physical_to_logical_map.shape[1] + self.model_runner.eplb_state.physical_to_logical_map.shape[1] # type: ignore[attr-defined] ) parallel_config.eplb_config.num_redundant_experts = ( new_physical_experts - - self.model_runner.eplb_state.logical_replica_count.shape[1] + - self.model_runner.eplb_state.logical_replica_count.shape[1] # type: ignore[attr-defined] ) global_expert_loads = None else: - num_local_physical_experts = torch.tensor( + num_local_physical_experts_tensor = torch.tensor( [num_local_experts], dtype=torch.int32, device="cpu" ) torch.distributed.broadcast( - num_local_physical_experts, group=get_ep_group().cpu_group, group_src=0 + num_local_physical_experts_tensor, + 
group=get_ep_group().cpu_group, + group_src=0, ) - num_local_physical_experts = num_local_physical_experts.item() + num_local_physical_experts = int(num_local_physical_experts_tensor.item()) new_physical_experts = num_local_physical_experts * new_ep_size assert self.model_runner.eplb_state is not None - global_expert_loads = self.model_runner.eplb_state.rearrange( + global_expert_loads_any = self.model_runner.eplb_state.rearrange( execute_shuffle=False ) + global_expert_loads = cast(list[torch.Tensor], global_expert_loads_any) parallel_config.eplb_config.num_redundant_experts = ( new_physical_experts - global_expert_loads[0].shape[1] ) @@ -849,8 +852,9 @@ def init_worker_distributed_environment( init_batch_invariance() set_custom_all_reduce(not parallel_config.disable_custom_all_reduce) + init_method = distributed_init_method or "env://" init_distributed_environment( - parallel_config.world_size, rank, distributed_init_method, local_rank, backend + parallel_config.world_size, rank, init_method, local_rank, backend ) ensure_model_parallel_initialized( diff --git a/vllm/v1/worker/kv_connector_model_runner_mixin.py b/vllm/v1/worker/kv_connector_model_runner_mixin.py index e59361f21372..ff047d8d03f0 100644 --- a/vllm/v1/worker/kv_connector_model_runner_mixin.py +++ b/vllm/v1/worker/kv_connector_model_runner_mixin.py @@ -59,7 +59,7 @@ def maybe_setup_kv_connector(scheduler_output: "SchedulerOutput"): @staticmethod def ensure_kv_transfer_shutdown() -> None: # has_kv_transfer_group can be None during interpreter shutdown. - if has_kv_transfer_group and has_kv_transfer_group(): + if has_kv_transfer_group and has_kv_transfer_group(): # type: ignore[truthy-function] ensure_kv_transfer_shutdown() @staticmethod diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 923c31c187f3..450160d28649 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -572,7 +572,10 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: format. Layers that do not need KV cache are not included. 
""" - layers = get_layers_from_vllm_config(self.vllm_config, AttentionLayerBase) + layers = get_layers_from_vllm_config( + self.vllm_config, + AttentionLayerBase, # type: ignore[type-abstract] + ) block_size = self.vllm_config.cache_config.block_size cache_dtype_str = self.vllm_config.cache_config.cache_dtype @@ -725,7 +728,11 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput", start_index: int) req_id = self.input_batch.req_ids[i] assert req_id is not None num_tokens = scheduler_output.num_scheduled_tokens[req_id] - if not use_max_model_len and num_tokens > self.most_model_len: + if ( + not use_max_model_len + and self.most_model_len is not None + and num_tokens > self.most_model_len + ): use_max_model_len = True num_scheduled_tokens_per_req.append(num_tokens) if use_max_model_len: @@ -737,6 +744,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput", start_index: int) else: end_index = num_reqs else: + assert self.num_reqs_most_model_len is not None if len(num_scheduled_tokens_per_req) > self.num_reqs_most_model_len: num_scheduled_tokens_per_req = num_scheduled_tokens_per_req[ : self.num_reqs_most_model_len @@ -829,6 +837,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput", start_index: int) ].to(self.device) seq_lens = self.seq_lens_cpu[: self.num_reqs_max_model_len].to(self.device) else: + assert self.num_reqs_most_model_len is not None block_tables = self.block_table_cpu[ : self.num_reqs_most_model_len, : self.num_blocks_per_most_len_req ] @@ -931,6 +940,8 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"): for mm_input_id in encoder_input_ids: mm_feature = req_state.mm_features[mm_input_id] + if mm_feature.data is None: + continue mm_hash = mm_feature.identifier mm_kwargs.append(mm_feature.data) mm_hashes_pos.append((mm_hash, mm_feature.mm_position)) @@ -1114,7 +1125,7 @@ def sample_tokens( ) -> ModelRunnerOutput: if self.scheduler_output is None: # Nothing to do (PP non-final rank case), output isn't used. - return None # noqa + return None # type: ignore[return-value] scheduler_output = self.scheduler_output mm_embed_inputs = self.mm_embed_inputs self.scheduler_output = None @@ -1696,7 +1707,8 @@ def profile_run( ) -> None: # Profile with multimodal encoder & encoder cache. if self.supports_mm_inputs: - if self.model_config.multimodal_config.skip_mm_profiling: + mm_config = self.model_config.multimodal_config + if mm_config is not None and mm_config.skip_mm_profiling: logger.info( "Skipping memory profiling for multimodal encoder and " "encoder cache." @@ -2166,5 +2178,9 @@ def _tpu_reset_lora(self, index: int): if isinstance(module, BaseLayerWithLoRA): module._original_set_lora = module.set_lora module._original_reset_lora = module.reset_lora - module.set_lora = _tpu_set_lora.__get__(module, module.__class__) - module.reset_lora = _tpu_reset_lora.__get__(module, module.__class__) + module.set_lora = _tpu_set_lora.__get__( # type: ignore[method-assign] + module, module.__class__ + ) + module.reset_lora = _tpu_reset_lora.__get__( # type: ignore[method-assign] + module, module.__class__ + ) diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index a716a9c3aa82..569b2aaa766e 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -141,8 +141,7 @@ def init_device(self): # Set random seed. 
set_random_seed(self.model_config.seed) - if self.model_config.seed is not None: - xm.set_rng_state(self.model_config.seed, self.device) + xm.set_rng_state(self.model_config.seed, self.device) # Increase the cache size limit, which is the maximum number of # dynamo graphs that can be compiled. @@ -332,7 +331,7 @@ def _init_tpu_worker_distributed_environment( world_size=parallel_config.world_size, rank=rank, local_rank=local_rank, - distributed_init_method=distributed_init_method, + distributed_init_method=distributed_init_method or "env://", backend=current_platform.dist_backend, ) ensure_model_parallel_initialized( diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 9e99ea964ee0..92e4ce3abdba 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -280,7 +280,7 @@ def bind_kv_cache( kv_caches: dict[str, torch.Tensor], forward_context: dict[str, "Attention"], runner_kv_caches: list[torch.Tensor], - num_attn_module: int | None = 1, + num_attn_module: int = 1, ) -> None: """ Bind the allocated KV cache to both ModelRunner and forward context so @@ -362,5 +362,7 @@ def is_residual_scattered_for_sp( or vllm_config.compilation_config.use_inductor_graph_partition ): return True - - return num_input_tokens in vllm_config.compilation_config.compile_sizes + compile_sizes = vllm_config.compilation_config.compile_sizes + if compile_sizes is None: + return False + return num_input_tokens in compile_sizes diff --git a/vllm/v1/worker/worker_base.py b/vllm/v1/worker/worker_base.py index 16f321c08077..57e7037e946e 100644 --- a/vllm/v1/worker/worker_base.py +++ b/vllm/v1/worker/worker_base.py @@ -315,10 +315,12 @@ def init_worker(self, all_kwargs: list[dict[str, Any]]) -> None: def initialize_from_config(self, kv_cache_configs: list[Any]) -> None: kv_cache_config = kv_cache_configs[self.global_rank] + assert self.vllm_config is not None with set_current_vllm_config(self.vllm_config): self.worker.initialize_from_config(kv_cache_config) # type: ignore def init_device(self): + assert self.vllm_config is not None with set_current_vllm_config(self.vllm_config): # To make vLLM config available during device initialization self.worker.init_device() # type: ignore diff --git a/vllm/v1/worker/xpu_worker.py b/vllm/v1/worker/xpu_worker.py index 26c6f8d06bdc..4d7864e90496 100644 --- a/vllm/v1/worker/xpu_worker.py +++ b/vllm/v1/worker/xpu_worker.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os +from typing import Any import torch import torch.distributed @@ -37,6 +38,7 @@ def __init__( # Torch profiler. 
Enabled and configured through env vars: # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace + self.profiler: Any | None = None if envs.VLLM_TORCH_PROFILER_DIR: torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR worker_name = f"{vllm_config.instance_id}-rank-{self.rank}" @@ -148,7 +150,12 @@ def determine_available_memory(self) -> int: return int(available_kv_cache_memory) def init_device(self): - if self.device_config.device.type == "xpu" and current_platform.is_xpu(): + device = self.device_config.device + if ( + isinstance(device, torch.device) + and device.type == "xpu" + and current_platform.is_xpu() + ): self.device = torch.device(f"xpu:{self.local_rank}") current_platform.set_device(self.device) current_platform.check_if_supports_dtype(self.model_config.dtype) From 0e741c12e3dc45093b2ddab8a31310703aa27002 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 21 Nov 2025 11:38:35 +0800 Subject: [PATCH 266/578] [Bugfix] Fix Plamo3 rope handling (#29092) Signed-off-by: DarkLight1337 Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/plamo3.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/models/plamo3.py b/vllm/model_executor/models/plamo3.py index 5bb07722a5fc..4aeb9d432dcc 100644 --- a/vllm/model_executor/models/plamo3.py +++ b/vllm/model_executor/models/plamo3.py @@ -62,7 +62,7 @@ class Plamo3Config(PretrainedConfig): # type: ignore # if `sliding_window` is list interleaved_sliding_window: list[int | None] sliding_window_pattern: int - rope_theta: int + rope_parameters: dict[str, Any] rope_local_theta: int # MLP intermediate_size: int @@ -153,13 +153,24 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> No quant_config=quant_config, prefix=f"{prefix}.o_proj", ) + layer_idx = extract_layer_index(prefix) - full_attn = config.interleaved_sliding_window[layer_idx] is None + layer_type = config.layer_types[layer_idx] + is_sliding = layer_type == "sliding_attention" - self.rope_theta = config.rope_theta if full_attn else config.rope_local_theta - self.rope_scaling = ( - config.rope_scaling if hasattr(config, "rope_scaling") else None - ) + # Initialize the rotary embedding. + if layer_type in config.rope_parameters: + # Transformers v5 rope config. + rope_parameters = config.rope_parameters[layer_type] + else: + # Transformers v4 rope config. + # Global attention. Use the values in config.json. + rope_parameters = config.rope_parameters + # Local attention. Override the values in config.json. 
+ if is_sliding: + rope_parameters = dict( + rope_type="default", rope_theta=config.rope_local_theta + ) max_position = config.max_position_embeddings if hasattr(vllm_config.model_config, "max_model_len") and isinstance( vllm_config.model_config.max_model_len, int @@ -170,8 +181,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> No self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=self.rope_theta, - rope_scaling=self.rope_scaling, + rope_parameters=rope_parameters, ) self.q_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps) set_weight_attrs( From a982f5b5ea4a1932424927ea357b532d0e45caf1 Mon Sep 17 00:00:00 2001 From: zhrrr <43847754+izhuhaoran@users.noreply.github.com> Date: Fri, 21 Nov 2025 11:39:09 +0800 Subject: [PATCH 267/578] [kernel][perf] support uncontiguous input for rms_norm kernel (#28103) Signed-off-by: zhuhaoran Signed-off-by: izhuhaoran Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> --- csrc/dispatch_utils.h | 21 ++++++++ csrc/layernorm_kernels.cu | 80 +++++++++++++++++++++---------- vllm/_custom_ops.py | 5 +- vllm/compilation/matcher_utils.py | 4 +- 4 files changed, 77 insertions(+), 33 deletions(-) diff --git a/csrc/dispatch_utils.h b/csrc/dispatch_utils.h index 9ae0ed975edd..e1d131e4a785 100644 --- a/csrc/dispatch_utils.h +++ b/csrc/dispatch_utils.h @@ -117,3 +117,24 @@ break; \ } \ } + +#define VLLM_DISPATCH_RANK234(NUM_DIMS, ...) \ + switch (NUM_DIMS) { \ + case 2: { \ + constexpr int tensor_rank = 2; \ + __VA_ARGS__(); \ + break; \ + } \ + case 3: { \ + constexpr int tensor_rank = 3; \ + __VA_ARGS__(); \ + break; \ + } \ + case 4: { \ + constexpr int tensor_rank = 4; \ + __VA_ARGS__(); \ + break; \ + } \ + default: \ + TORCH_CHECK(false, "Expects rank 2, 3 or 4 tensors but got ", NUM_DIMS); \ + } diff --git a/csrc/layernorm_kernels.cu b/csrc/layernorm_kernels.cu index 48771e4b3aff..dfc67b933cca 100644 --- a/csrc/layernorm_kernels.cu +++ b/csrc/layernorm_kernels.cu @@ -10,16 +10,38 @@ namespace vllm { // TODO(woosuk): Further optimize this kernel. 
-template +template __global__ void rms_norm_kernel( - scalar_t* __restrict__ out, // [..., hidden_size] - const scalar_t* __restrict__ input, // [..., hidden_size] - const int64_t input_stride, + scalar_t* __restrict__ out, // [..., hidden_size] + const scalar_t* __restrict__ input, // [..., hidden_size] + const int64_t input_stride_d2, // input.stride(-2) + const int64_t input_stride_d3, // input.stride(-3) + const int64_t input_stride_d4, // input.stride(-4) + const int64_t input_shape_d2, // input.size(-2) + const int64_t input_shape_d3, // input.size(-3) const scalar_t* __restrict__ weight, // [hidden_size] const float epsilon, const int num_tokens, const int hidden_size) { __shared__ float s_variance; float variance = 0.0f; - const scalar_t* input_row = input + blockIdx.x * input_stride; + const scalar_t* input_row; + if constexpr (NUM_DIMS == 2) { + // 2D for layernorm normal case [batch_size, hidden] + input_row = input + blockIdx.x * input_stride_d2; + } else if constexpr (NUM_DIMS == 3) { + // 3D for q/k norm [batch_size, num_heads, head_size] + int batch_idx = blockIdx.x / input_shape_d2; + int head_idx = blockIdx.x % input_shape_d2; + input_row = + input + batch_idx * input_stride_d3 + head_idx * input_stride_d2; + } else if constexpr (NUM_DIMS == 4) { + // 4D for transformers model_impl qk norm [batch, seq, head, head_dim] + int batch_idx = blockIdx.x / (input_shape_d3 * input_shape_d2); + int remaining = blockIdx.x % (input_shape_d3 * input_shape_d2); + int seq_idx = remaining / input_shape_d2; + int head_idx = remaining % input_shape_d2; + input_row = input + batch_idx * input_stride_d4 + + seq_idx * input_stride_d3 + head_idx * input_stride_d2; + } auto vec_op = [&variance](const vec_n_t& vec) { #pragma unroll @@ -164,38 +186,44 @@ void rms_norm(torch::Tensor& out, // [..., hidden_size] torch::Tensor& weight, // [hidden_size] double epsilon) { TORCH_CHECK(out.is_contiguous()); + if (input.stride(-1) != 1) { + input = input.contiguous(); + } TORCH_CHECK(input.stride(-1) == 1); TORCH_CHECK(weight.is_contiguous()); int hidden_size = input.size(-1); - // We cannot just use `input.stride(-2)` if the tensor is not row-major. - // Instead, we use a 2d view to get the second-innermost stride. - // That way the dimensions (except the last one) can be arbitrarily permuted. - torch::Tensor input_view = input.view({-1, hidden_size}); - - int num_tokens = input_view.numel() / hidden_size; - int64_t input_stride = input_view.stride(-2); + int num_tokens = input.numel() / hidden_size; + int num_dims = input.dim(); + int64_t input_stride_d2 = input.stride(-2); + int64_t input_stride_d3 = (num_dims >= 3) ? input.stride(-3) : 0; + int64_t input_stride_d4 = (num_dims >= 4) ? input.stride(-4) : 0; + int64_t input_shape_d2 = (num_dims >= 3) ? input.size(-2) : 0; + int64_t input_shape_d3 = (num_dims >= 4) ? input.size(-3) : 0; // For large num_tokens, use smaller blocks to increase SM concurrency. const int max_block_size = (num_tokens < 256) ? 
1024 : 256; dim3 grid(num_tokens); - const at::cuda::OptionalCUDAGuard device_guard(device_of(input_view)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_TYPES( - input_view.scalar_type(), "rms_norm_kernel", [&] { - const int calculated_vec_size = - std::gcd(16 / sizeof(scalar_t), hidden_size); - const int block_size = - std::min(hidden_size / calculated_vec_size, max_block_size); - dim3 block(block_size); - VLLM_DISPATCH_VEC_SIZE(calculated_vec_size, [&] { - vllm::rms_norm_kernel<<>>( - out.data_ptr(), input_view.data_ptr(), - input_stride, weight.data_ptr(), epsilon, num_tokens, - hidden_size); - }); + VLLM_DISPATCH_RANK234(num_dims, [&] { + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] { + const int calculated_vec_size = + std::gcd(16 / sizeof(scalar_t), hidden_size); + const int block_size = + std::min(hidden_size / calculated_vec_size, max_block_size); + dim3 block(block_size); + VLLM_DISPATCH_VEC_SIZE(calculated_vec_size, [&] { + vllm::rms_norm_kernel + <<>>( + out.data_ptr(), input.data_ptr(), + input_stride_d2, input_stride_d3, input_stride_d4, + input_shape_d2, input_shape_d3, weight.data_ptr(), + epsilon, num_tokens, hidden_size); }); + }); + }); } #define LAUNCH_FUSED_ADD_RMS_NORM(width) \ diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 66cf6472eee4..0f625a794524 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -328,10 +328,7 @@ def rotary_embedding( def rms_norm( out: torch.Tensor, input: torch.Tensor, weight: torch.Tensor, epsilon: float ) -> None: - # TODO: Remove this contiguous call when the kernel is updated to support non-contiguous input - # If removed, also need to remove contiguous in MatcherRMSNorm - input_contiguous = input.contiguous() - torch.ops._C.rms_norm(out, input_contiguous, weight, epsilon) + torch.ops._C.rms_norm(out, input, weight, epsilon) def fused_add_rms_norm( diff --git a/vllm/compilation/matcher_utils.py b/vllm/compilation/matcher_utils.py index 38eb4e5301a1..e4cd063d2aee 100644 --- a/vllm/compilation/matcher_utils.py +++ b/vllm/compilation/matcher_utils.py @@ -162,12 +162,10 @@ def forward_custom( weight: torch.Tensor, ) -> torch.Tensor: result = torch.empty_like(input) - # TODO: support non-contiguous input for RMSNorm and remove this - input_contiguous = input.contiguous() _, result = auto_functionalized( RMS_OP, result=result, - input=input_contiguous, + input=input, weight=weight, epsilon=self.epsilon, ) From 0730414999343e722590ace615d5814c7e5b6827 Mon Sep 17 00:00:00 2001 From: jeremyteboul <80506730+jeremyteboul@users.noreply.github.com> Date: Thu, 20 Nov 2025 19:39:47 -0800 Subject: [PATCH 268/578] [Core] Add audio_embeds support to chat completions (#29059) Signed-off-by: Jeremy Teboul Co-authored-by: Jeremy Teboul --- docs/features/multimodal_inputs.md | 32 ++++++ tests/entrypoints/test_chat_utils.py | 145 ++++++++++++++++++++++++++ vllm/entrypoints/chat_utils.py | 149 ++++++++++++++++++++++++++- vllm/multimodal/audio.py | 24 +++++ vllm/multimodal/utils.py | 13 ++- 5 files changed, 360 insertions(+), 3 deletions(-) diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index 5f684604e603..4656ee43ea25 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -365,6 +365,8 @@ You must enable this feature via `enable_mm_embeds=True`. The vLLM engine may crash if incorrect shape of embeddings is passed. 
Only enable this flag for trusted users! +#### Image Embeddings + ??? code ```python @@ -441,6 +443,36 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd print(generated_text) ``` +#### Audio Embeddings + +You can pass pre-computed audio embeddings similar to image embeddings: + +??? code + + ```python + from vllm import LLM + import torch + + # Enable audio embeddings support + llm = LLM(model="fixie-ai/ultravox-v0_5-llama-3_2-1b", enable_mm_embeds=True) + + # Refer to the HuggingFace repo for the correct format to use + prompt = "USER: